akhaliq HF Staff committed on
Commit
da1592a
·
verified ·
1 Parent(s): 9bb6072

Upload index.js with huggingface_hub

Browse files
Files changed (1) hide show
  1. index.js +369 -66
index.js CHANGED
@@ -1,76 +1,379 @@
1
- import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/[email protected]';
2
-
3
// Grab the DOM elements the demo manipulates.
const status = document.getElementById('status');
const fileUpload = document.getElementById('upload');
const imageContainer = document.getElementById('container');
const example = document.getElementById('example');

const EXAMPLE_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/city-streets.jpg';

// Load the object-detection pipeline eagerly; top-level await blocks the
// rest of the module until the model has finished downloading.
status.textContent = 'Loading model...';
const detector = await pipeline('object-detection', 'Xenova/detr-resnet-50');
status.textContent = 'Ready';

// Clicking the example link runs detection on the hosted sample image.
example.addEventListener('click', function (e) {
    e.preventDefault();
    detect(EXAMPLE_URL);
});

// Choosing a file reads it as a data URL and runs detection on the result.
fileUpload.addEventListener('change', (e) => {
    const [file] = e.target.files;
    if (!file) {
        return;
    }

    const reader = new FileReader();
    // Fires once the file has been fully read.
    reader.onload = (loadEvent) => detect(loadEvent.target.result);
    reader.readAsDataURL(file);
});
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
 
 
 
 
35
 
36
/**
 * Run object detection on an image and render the results.
 * @param {string} img - Image URL or data URL to analyse.
 */
async function detect(img) {
    // Clear any previous boxes and show the image as the container background.
    imageContainer.innerHTML = '';
    imageContainer.style.backgroundImage = `url(${img})`;

    status.textContent = 'Analysing...';
    const options = {
        threshold: 0.5,
        percentage: true, // box coordinates come back as fractions of image size
    };
    const output = await detector(img, options);
    status.textContent = '';

    for (const detection of output) {
        renderBox(detection);
    }
}
 
 
 
 
 
49
 
50
/**
 * Render one bounding box with its label onto the image container.
 * @param {{ box: { xmax: number, xmin: number, ymax: number, ymin: number },
 *           label: string }} detection
 *        A single detection result; coordinates are fractions of the image
 *        size (the pipeline is invoked with `percentage: true`).
 */
function renderBox({ box, label }) {
    const { xmax, xmin, ymax, ymin } = box;

    // Generate a random color for the box. padStart expects a string pad;
    // the original passed the number 0 and relied on implicit coercion.
    const color = '#' + Math.floor(Math.random() * 0xFFFFFF).toString(16).padStart(6, '0');

    // Draw the box, positioned and sized in percentages of the container.
    const boxElement = document.createElement('div');
    boxElement.className = 'bounding-box';
    Object.assign(boxElement.style, {
        borderColor: color,
        left: 100 * xmin + '%',
        top: 100 * ymin + '%',
        width: 100 * (xmax - xmin) + '%',
        height: 100 * (ymax - ymin) + '%',
    });

    // Draw the label; textContent ensures the label cannot inject HTML.
    const labelElement = document.createElement('span');
    labelElement.textContent = label;
    labelElement.className = 'bounding-box-label';
    labelElement.style.backgroundColor = color;

    boxElement.appendChild(labelElement);
    imageContainer.appendChild(boxElement);
}
 
 
 
 
 
 
1
+ import {
2
+ AutoProcessor,
3
+ AutoModelForImageTextToText,
4
+ TextStreamer,
5
+ } from 'https://cdn.jsdelivr.net/npm/@huggingface/[email protected]';
6
+
7
/**
 * Browser app that captions an uploaded video: it samples evenly spaced
 * frames, runs an image-text-to-text model (FastVLM) on each frame, then
 * asks the model for an overall summary of the frame descriptions.
 *
 * Expects the host page to provide the element IDs referenced in
 * `initializeElements` plus a `.video-placeholder` element.
 */
class VideoCaptionApp {
  constructor() {
    this.videoFile = null;
    this.videoUrl = null;       // object URL for the currently loaded video
    this.processor = null;      // lazily created in initializeModel()
    this.model = null;          // lazily created in initializeModel()
    this.useGPU = false;
    this.isProcessing = false;  // re-entrancy guard for analyzeVideo()

    this.initializeElements();
    this.bindEvents();
    this.checkWebGPUSupport();
  }

  /**
   * Escape untrusted text before interpolating it into an innerHTML
   * template. Model output and error messages are untrusted.
   * @param {*} text - value to escape (coerced to string).
   * @returns {string} HTML-safe text.
   */
  static escapeHtml(text) {
    return String(text)
      .replaceAll('&', '&amp;')
      .replaceAll('<', '&lt;')
      .replaceAll('>', '&gt;')
      .replaceAll('"', '&quot;')
      .replaceAll("'", '&#39;');
  }

  /** Cache every DOM element the app manipulates. */
  initializeElements() {
    this.elements = {
      videoPlayer: document.getElementById('videoPlayer'),
      fileInput: document.getElementById('fileInput'),
      uploadBtn: document.getElementById('uploadBtn'),
      analyzeBtn: document.getElementById('analyzeBtn'),
      captionPanel: document.getElementById('captionPanel'),
      closePanel: document.getElementById('closePanel'),
      loadingOverlay: document.getElementById('loadingOverlay'),
      loadingText: document.getElementById('loadingText'),
      progressFill: document.getElementById('progressFill'),
      resultsContainer: document.getElementById('resultsContainer'),
      frameCount: document.getElementById('frameCount'),
      frameCountValue: document.getElementById('frameCountValue'),
      maxDuration: document.getElementById('maxDuration'),
      maxDurationValue: document.getElementById('maxDurationValue'),
      toggleDevice: document.getElementById('toggleDevice'),
      videoPlaceholder: document.querySelector('.video-placeholder'),
    };
  }

  /** Wire up button, slider, and drag-and-drop event handlers. */
  bindEvents() {
    this.elements.uploadBtn.addEventListener('click', () => {
      this.elements.fileInput.click();
    });

    this.elements.fileInput.addEventListener('change', (e) => {
      this.handleFileUpload(e);
    });

    this.elements.analyzeBtn.addEventListener('click', () => {
      this.analyzeVideo();
    });

    this.elements.closePanel.addEventListener('click', () => {
      this.elements.captionPanel.classList.remove('active');
    });

    // Keep the slider value labels in sync with the sliders.
    this.elements.frameCount.addEventListener('input', (e) => {
      this.elements.frameCountValue.textContent = e.target.value;
    });

    this.elements.maxDuration.addEventListener('input', (e) => {
      this.elements.maxDurationValue.textContent = e.target.value;
    });

    this.elements.toggleDevice.addEventListener('click', () => {
      this.toggleDevice();
    });

    // Drag and drop support
    const app = document.getElementById('app');
    app.addEventListener('dragover', (e) => {
      e.preventDefault(); // required so 'drop' fires
      app.classList.add('dragging');
    });

    app.addEventListener('dragleave', () => {
      app.classList.remove('dragging');
    });

    app.addEventListener('drop', (e) => {
      e.preventDefault();
      app.classList.remove('dragging');
      const files = e.dataTransfer.files;
      if (files.length > 0 && files[0].type.startsWith('video/')) {
        this.loadVideo(files[0]);
      }
    });
  }

  /**
   * Show the GPU/CPU toggle only when a WebGPU adapter is actually
   * available; hide it otherwise.
   */
  async checkWebGPUSupport() {
    if ('gpu' in navigator) {
      try {
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) {
          this.elements.toggleDevice.style.display = 'flex';
          return;
        }
      } catch (e) {
        console.log('WebGPU not supported:', e);
      }
    }
    this.elements.toggleDevice.style.display = 'none';
  }

  /** Flip between CPU and WebGPU execution and update the toggle UI. */
  toggleDevice() {
    this.useGPU = !this.useGPU;
    const deviceLabel = this.elements.toggleDevice.querySelector('.device-label');
    deviceLabel.textContent = this.useGPU ? 'GPU' : 'CPU';
    this.elements.toggleDevice.classList.toggle('gpu-active', this.useGPU);

    // Reset model to force reload with new device
    this.model = null;
    this.processor = null;
  }

  /** Accept a video chosen via the hidden file input. */
  handleFileUpload(event) {
    const file = event.target.files[0];
    if (file && file.type.startsWith('video/')) {
      this.loadVideo(file);
    }
  }

  /**
   * Load a video file into the player and open the caption panel.
   * @param {File} file - a file whose MIME type starts with 'video/'.
   */
  loadVideo(file) {
    // Release the previous object URL so repeated uploads don't leak memory.
    if (this.videoUrl) {
      URL.revokeObjectURL(this.videoUrl);
    }

    this.videoFile = file;
    this.videoUrl = URL.createObjectURL(file);

    this.elements.videoPlayer.src = this.videoUrl;
    this.elements.videoPlayer.style.display = 'block';
    this.elements.videoPlaceholder.style.display = 'none';
    this.elements.captionPanel.classList.add('active');

    // { once: true } so listeners don't accumulate across uploads.
    this.elements.videoPlayer.addEventListener('loadedmetadata', () => {
      console.log('Video loaded:', {
        duration: this.elements.videoPlayer.duration,
        width: this.elements.videoPlayer.videoWidth,
        height: this.elements.videoPlayer.videoHeight,
      });
    }, { once: true });
  }

  /**
   * Sample evenly spaced frames from the loaded video.
   * @param {{ frameCount?: number, maxDuration?: number }} [options]
   *        frameCount - number of frames to capture (default 8);
   *        maxDuration - cap, in seconds, on the sampled portion (default 10).
   * @returns {Promise<Array<{ time: string, blob: Blob }>>} JPEG blobs
   *          paired with their timestamps (seconds, two decimals).
   */
  async extractFramesFromVideo(options = {}) {
    const {
      frameCount = 8,
      maxDuration = 10,
    } = options;

    return new Promise((resolve, reject) => {
      const video = document.createElement('video');
      video.crossOrigin = 'anonymous';
      video.muted = true;

      video.addEventListener('loadedmetadata', async () => {
        const duration = Math.min(video.duration, maxDuration);
        const interval = duration / frameCount;
        const frames = [];

        const canvas = document.createElement('canvas');
        const ctx = canvas.getContext('2d');

        // Downscale large videos; the model does not need full resolution.
        canvas.width = Math.min(video.videoWidth, 640);
        canvas.height = Math.min(video.videoHeight, 480);

        for (let i = 0; i < frameCount; i++) {
          const time = i * interval;

          // Attach the 'seeked' listener BEFORE triggering the seek so the
          // event cannot be missed, then rasterize the landed frame.
          // NOTE(review): some browsers may not fire 'seeked' when seeking
          // to the current position (e.g. t=0 on a fresh video) — confirm
          // on target browsers.
          await new Promise((frameDone) => {
            video.addEventListener('seeked', () => {
              ctx.drawImage(video, 0, 0, canvas.width, canvas.height);

              canvas.toBlob((blob) => {
                frames.push({
                  time: time.toFixed(2),
                  blob,
                });
                // Frame extraction occupies the first 20% of the progress bar.
                this.updateProgress((i + 1) / frameCount * 20);
                frameDone();
              }, 'image/jpeg', 0.8);
            }, { once: true });

            video.currentTime = time;
          });
        }

        resolve(frames);
      });

      video.addEventListener('error', (e) => {
        reject(new Error(`Failed to load video: ${e.message}`));
      });

      video.src = this.videoUrl;
      video.load();
    });
  }

  /**
   * Decode a frame blob into an ImageBitmap for the processor.
   * Callers must close() the bitmap when done to free pixel memory.
   */
  async loadImageFromSource(blob) {
    const bitmap = await createImageBitmap(blob);
    return bitmap;
  }

  /** Set the progress bar width, 0–100. */
  updateProgress(percentage) {
    this.elements.progressFill.style.width = `${percentage}%`;
  }

  /**
   * Lazily load the processor and model. Reloaded from scratch after a
   * device toggle (toggleDevice() nulls both references).
   */
  async initializeModel() {
    if (!this.model || !this.processor) {
      this.elements.loadingText.textContent = 'Loading AI model...';

      const model_id = "onnx-community/FastVLM-0.5B-ONNX";

      this.processor = await AutoProcessor.from_pretrained(model_id);

      const deviceConfig = this.useGPU ? { device: 'webgpu' } : {};

      // Quantized weights keep the in-browser download and memory small.
      this.model = await AutoModelForImageTextToText.from_pretrained(model_id, {
        ...deviceConfig,
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
      });

      this.updateProgress(30);
    }
  }

  /**
   * Run one generate pass: build the chat prompt, process the image, and
   * decode only the newly generated tokens.
   * @param {Blob} blob - image input for the model.
   * @param {string} content - user-message content (may include an <image> slot).
   * @param {number} maxNewTokens - generation budget.
   * @returns {Promise<string>} the decoded model output.
   */
  async describeImage(blob, content, maxNewTokens) {
    const messages = [{
      role: "user",
      content,
    }];

    const prompt = this.processor.apply_chat_template(messages, {
      add_generation_prompt: true,
    });

    const image = await this.loadImageFromSource(blob);
    try {
      const inputs = await this.processor(image, prompt, {
        add_special_tokens: false,
      });

      const outputs = await this.model.generate({
        ...inputs,
        max_new_tokens: maxNewTokens,
        do_sample: false,
      });

      // Slice off the prompt tokens so only the generated answer is decoded.
      const decoded = this.processor.batch_decode(
        outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
        { skip_special_tokens: true },
      );
      return decoded[0];
    } finally {
      // Release the bitmap's pixel memory promptly instead of waiting for GC.
      image.close();
    }
  }

  /**
   * Full pipeline: load model, extract frames, caption each frame, then
   * generate an overall summary. Guarded against re-entry while running.
   */
  async analyzeVideo() {
    if (this.isProcessing || !this.videoUrl) return;

    this.isProcessing = true;
    this.elements.loadingOverlay.classList.add('active');
    this.elements.analyzeBtn.disabled = true;
    this.elements.resultsContainer.innerHTML = '';
    this.updateProgress(0);

    try {
      await this.initializeModel();

      // Always pass a radix to parseInt.
      const frameCount = Number.parseInt(this.elements.frameCount.value, 10);
      const maxDuration = Number.parseInt(this.elements.maxDuration.value, 10);

      this.elements.loadingText.textContent = 'Extracting frames...';
      const frames = await this.extractFramesFromVideo({ frameCount, maxDuration });
      if (frames.length === 0) {
        throw new Error('No frames could be extracted from the video');
      }

      const frameAnalyses = [];
      const progressPerFrame = 60 / frameCount;

      for (let i = 0; i < frames.length; i++) {
        const frame = frames[i];
        this.elements.loadingText.textContent = `Analyzing frame ${i + 1} of ${frameCount}...`;

        const description = await this.describeImage(
          frame.blob,
          `Frame at ${frame.time}s: <image>Describe what's happening in this frame in detail.`,
          256,
        );

        frameAnalyses.push({
          time: frame.time,
          description,
        });

        this.updateProgress(30 + (i + 1) * progressPerFrame);
      }

      this.elements.loadingText.textContent = 'Generating summary...';

      // Generate summary from the per-frame descriptions. The first frame is
      // passed as the (required) image input, matching the model's signature.
      const summaryPrompt = `Based on these frame descriptions from a video, provide a coherent summary of what happens in the video:\n\n${
        frameAnalyses.map(f => `At ${f.time}s: ${f.description}`).join('\n')
      }\n\nVideo Summary:`;

      const summary = await this.describeImage(frames[0].blob, summaryPrompt, 512);

      this.updateProgress(100);
      this.displayResults({ frames: frameAnalyses, summary });

    } catch (error) {
      console.error('Analysis error:', error);
      this.showError(error.message);
    } finally {
      this.isProcessing = false;
      this.elements.loadingOverlay.classList.remove('active');
      this.elements.analyzeBtn.disabled = false;
    }
  }

  /**
   * Render the summary and per-frame descriptions into the results panel.
   * All model output is escaped before insertion via innerHTML (XSS).
   */
  displayResults(analysis) {
    const esc = VideoCaptionApp.escapeHtml;
    let html = `
      <div class="summary-section">
        <h3>Video Summary</h3>
        <p>${esc(analysis.summary || 'No summary available')}</p>
      </div>
      <div class="frames-section">
        <h3>Frame-by-Frame Analysis</h3>
        <div class="frame-list">
    `;

    analysis.frames.forEach(frame => {
      html += `
        <div class="frame-item">
          <div class="frame-time">${esc(frame.time)}s</div>
          <div class="frame-description">${esc(frame.description)}</div>
        </div>
      `;
    });

    html += `
        </div>
      </div>
    `;

    this.elements.resultsContainer.innerHTML = html;
  }

  /** Show an error message (escaped) in the results panel. */
  showError(message) {
    this.elements.resultsContainer.innerHTML = `
      <div class="error-message">
        <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
          <circle cx="12" cy="12" r="10"></circle>
          <line x1="12" y1="8" x2="12" y2="12"></line>
          <line x1="12" y1="16" x2="12.01" y2="16"></line>
        </svg>
        <p>Error: ${VideoCaptionApp.escapeHtml(message)}</p>
      </div>
    `;
  }
}
375
+
376
// Initialize the app once the DOM is ready. Module scripts can be evaluated
// after DOMContentLoaded has already fired (e.g. when injected dynamically),
// in which case waiting for the event would leave the app uninitialized —
// so check readyState first.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => {
    new VideoCaptionApp();
  });
} else {
  new VideoCaptionApp();
}