Spaces:
Running
Running
Upload index.js with huggingface_hub
Browse files
index.js
CHANGED
@@ -1,76 +1,379 @@
|
|
1 |
-
import {
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
}
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
}
|
|
|
|
|
26 |
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
const
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
});
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
left: 100 * xmin + '%',
|
63 |
-
top: 100 * ymin + '%',
|
64 |
-
width: 100 * (xmax - xmin) + '%',
|
65 |
-
height: 100 * (ymax - ymin) + '%',
|
66 |
-
})
|
67 |
-
|
68 |
-
// Draw label
|
69 |
-
const labelElement = document.createElement('span');
|
70 |
-
labelElement.textContent = label;
|
71 |
-
labelElement.className = 'bounding-box-label';
|
72 |
-
labelElement.style.backgroundColor = color;
|
73 |
-
|
74 |
-
boxElement.appendChild(labelElement);
|
75 |
-
imageContainer.appendChild(boxElement);
|
76 |
}
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import {
|
2 |
+
AutoProcessor,
|
3 |
+
AutoModelForImageTextToText,
|
4 |
+
TextStreamer,
|
5 |
+
} from 'https://cdn.jsdelivr.net/npm/@huggingface/[email protected]';
|
6 |
+
|
7 |
+
/**
 * UI controller for the video captioning page.
 *
 * Lets the user pick (or drop) a local video, samples still frames from it,
 * asks the vision-language model for a description of every frame and for an
 * overall summary, and renders the result into the caption panel.
 */
class VideoCaptionApp {
  constructor() {
    this.videoFile = null;
    this.videoUrl = null;
    this.processor = null;
    this.model = null;
    this.useGPU = false;
    this.isProcessing = false;

    this.initializeElements();
    this.bindEvents();
    // Fire-and-forget: the method handles its own WebGPU probing errors.
    this.checkWebGPUSupport();
  }

  /**
   * Escapes the characters that are significant in HTML so that arbitrary
   * text (model output, error messages) can be interpolated into markup.
   *
   * @param {unknown} text - Value to escape; null/undefined become ''.
   * @returns {string} The escaped text.
   */
  static escapeHtml(text) {
    const entities = {
      '&': '&amp;',
      '<': '&lt;',
      '>': '&gt;',
      '"': '&quot;',
      "'": '&#39;',
    };
    return String(text ?? '').replace(/[&<>"']/g, (ch) => entities[ch]);
  }

  /** Looks up every DOM node the app talks to and caches it in `this.elements`. */
  initializeElements() {
    const ids = [
      'videoPlayer',
      'fileInput',
      'uploadBtn',
      'analyzeBtn',
      'captionPanel',
      'closePanel',
      'loadingOverlay',
      'loadingText',
      'progressFill',
      'resultsContainer',
      'frameCount',
      'frameCountValue',
      'maxDuration',
      'maxDurationValue',
      'toggleDevice',
    ];

    this.elements = Object.fromEntries(
      ids.map((id) => [id, document.getElementById(id)]),
    );
    this.elements.videoPlaceholder = document.querySelector('.video-placeholder');
  }

  /** Wires buttons, sliders and drag-and-drop to the corresponding handlers. */
  bindEvents() {
    const {
      uploadBtn,
      fileInput,
      analyzeBtn,
      closePanel,
      captionPanel,
      frameCount,
      frameCountValue,
      maxDuration,
      maxDurationValue,
      toggleDevice,
    } = this.elements;

    uploadBtn.addEventListener('click', () => {
      fileInput.click();
    });

    fileInput.addEventListener('change', (e) => {
      this.handleFileUpload(e);
    });

    analyzeBtn.addEventListener('click', () => {
      this.analyzeVideo();
    });

    closePanel.addEventListener('click', () => {
      captionPanel.classList.remove('active');
    });

    frameCount.addEventListener('input', (e) => {
      frameCountValue.textContent = e.target.value;
    });

    maxDuration.addEventListener('input', (e) => {
      maxDurationValue.textContent = e.target.value;
    });

    toggleDevice.addEventListener('click', () => {
      this.toggleDevice();
    });

    // Drag and drop support
    const app = document.getElementById('app');

    app.addEventListener('dragover', (e) => {
      // Without preventDefault the browser refuses the drop.
      e.preventDefault();
      app.classList.add('dragging');
    });

    app.addEventListener('dragleave', () => {
      app.classList.remove('dragging');
    });

    app.addEventListener('drop', (e) => {
      e.preventDefault();
      app.classList.remove('dragging');
      const file = e.dataTransfer.files[0];
      if (file && file.type.startsWith('video/')) {
        this.loadVideo(file);
      }
    });
  }

  /**
   * Shows the CPU/GPU toggle only when a WebGPU adapter can be obtained.
   *
   * @returns {Promise<void>}
   */
  async checkWebGPUSupport() {
    let hasAdapter = false;

    if ('gpu' in navigator) {
      try {
        hasAdapter = Boolean(await navigator.gpu.requestAdapter());
      } catch (e) {
        console.log('WebGPU not supported:', e);
      }
    }

    this.elements.toggleDevice.style.display = hasAdapter ? 'flex' : 'none';
  }

  /**
   * Switches between CPU and GPU inference and drops the cached model so the
   * next analysis reloads it for the selected device.
   *
   * Ignored while an analysis is running: clearing the model mid-run would
   * leave the in-flight analysis calling into `null`.
   */
  toggleDevice() {
    if (this.isProcessing) return;

    this.useGPU = !this.useGPU;

    const { toggleDevice } = this.elements;
    const deviceLabel = toggleDevice.querySelector('.device-label');
    if (deviceLabel) {
      deviceLabel.textContent = this.useGPU ? 'GPU' : 'CPU';
    }
    toggleDevice.classList.toggle('gpu-active', this.useGPU);

    // Reset model to force reload with new device
    this.model = null;
    this.processor = null;
  }

  /**
   * Handles the file input's `change` event.
   *
   * @param {Event} event - Change event whose target is the file input.
   */
  handleFileUpload(event) {
    const file = event.target.files[0];
    if (file && file.type.startsWith('video/')) {
      this.loadVideo(file);
    }
    // Clear the input so choosing the same file again still fires `change`.
    event.target.value = '';
  }

  /**
   * Makes `file` the current video: creates an object URL for it, shows it in
   * the player and opens the caption panel.
   *
   * @param {File} file - Video file chosen or dropped by the user.
   */
  loadVideo(file) {
    const previousUrl = this.videoUrl;
    const { videoPlayer, videoPlaceholder, captionPanel } = this.elements;

    this.videoFile = file;
    this.videoUrl = URL.createObjectURL(file);

    // `once` keeps listeners from piling up when several videos are loaded.
    videoPlayer.addEventListener(
      'loadedmetadata',
      () => {
        console.log('Video loaded:', {
          duration: videoPlayer.duration,
          width: videoPlayer.videoWidth,
          height: videoPlayer.videoHeight,
        });
      },
      { once: true },
    );

    videoPlayer.src = this.videoUrl;
    videoPlayer.style.display = 'block';
    videoPlaceholder.style.display = 'none';
    captionPanel.classList.add('active');

    // Release the blob URL of the video that was just replaced.
    if (previousUrl) {
      URL.revokeObjectURL(previousUrl);
    }
  }

  /**
   * Samples evenly spaced JPEG frames from the current video.
   *
   * Frames are taken at `i * duration / frameCount` seconds, where `duration`
   * is the video length capped at `maxDuration`. Frames are downscaled to fit
   * within 640x480 while keeping the aspect ratio. Progress is reported in
   * the 0-20% range.
   *
   * @param {{frameCount?: number, maxDuration?: number}} [options]
   * @returns {Promise<Array<{time: string, blob: Blob}>>} One entry per frame;
   *   `time` is the timestamp in seconds formatted with two decimals.
   * @throws {Error} If the video fails to load or a frame cannot be captured.
   */
  async extractFramesFromVideo(options = {}) {
    const { frameCount = 8, maxDuration = 10 } = options;
    const maxWidth = 640;
    const maxHeight = 480;

    const video = document.createElement('video');
    video.crossOrigin = 'anonymous';
    video.muted = true;

    // Rejects as soon as the element reports a media error. The `error` event
    // itself carries no message; the details live on `video.error`.
    const failed = new Promise((_, reject) => {
      video.addEventListener(
        'error',
        () => {
          const reason = video.error?.message || 'unknown error';
          reject(new Error(`Failed to load video: ${reason}`));
        },
        { once: true },
      );
    });

    // Resolves on the next `type` event, or rejects if the video errors first.
    const nextEvent = (type) =>
      Promise.race([
        new Promise((resolve) => {
          video.addEventListener(type, resolve, { once: true });
        }),
        failed,
      ]);

    // Listen before assigning `src` so the event cannot be missed.
    const metadataLoaded = nextEvent('loadedmetadata');
    video.src = this.videoUrl;
    video.load();
    await metadataLoaded;

    if (!video.videoWidth || !video.videoHeight) {
      throw new Error('Failed to load video: no video track found');
    }

    const duration = Math.min(video.duration, maxDuration);
    const interval = duration / frameCount;

    const canvas = document.createElement('canvas');
    const ctx = canvas.getContext('2d');

    // One scale factor for both axes keeps the picture undistorted.
    const scale = Math.min(
      1,
      maxWidth / video.videoWidth,
      maxHeight / video.videoHeight,
    );
    canvas.width = Math.max(1, Math.round(video.videoWidth * scale));
    canvas.height = Math.max(1, Math.round(video.videoHeight * scale));

    const encodeFrame = () =>
      new Promise((resolve, reject) => {
        canvas.toBlob(
          (blob) => {
            if (blob) {
              resolve(blob);
            } else {
              reject(new Error('Failed to capture video frame'));
            }
          },
          'image/jpeg',
          0.8,
        );
      });

    const frames = [];
    for (let i = 0; i < frameCount; i++) {
      const time = i * interval;

      const seeked = nextEvent('seeked');
      video.currentTime = time;
      await seeked;

      ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
      const blob = await encodeFrame();

      frames.push({ time: time.toFixed(2), blob });
      this.updateProgress(((i + 1) / frameCount) * 20);
    }

    return frames;
  }

  /**
   * Decodes an image blob into an ImageBitmap.
   *
   * NOTE(review): the result is passed straight to the model processor;
   * confirm that the processor accepts an ImageBitmap rather than the
   * library's own image type.
   *
   * @param {Blob} blob - Encoded image data.
   * @returns {Promise<ImageBitmap>}
   */
  async loadImageFromSource(blob) {
    return createImageBitmap(blob);
  }

  /**
   * Sets the width of the progress bar.
   *
   * @param {number} percentage - Progress in percent (0-100).
   */
  updateProgress(percentage) {
    this.elements.progressFill.style.width = `${percentage}%`;
  }

  /**
   * Loads the processor and model on first use (or after a device switch) and
   * moves the progress bar to 30%.
   *
   * @returns {Promise<void>}
   */
  async initializeModel() {
    if (!this.model || !this.processor) {
      this.elements.loadingText.textContent = 'Loading AI model...';

      const model_id = 'onnx-community/FastVLM-0.5B-ONNX';
      const deviceConfig = this.useGPU ? { device: 'webgpu' } : {};

      // Load into locals first so a failed download never leaves the
      // instance with only one of the two set.
      const processor = await AutoProcessor.from_pretrained(model_id);
      const model = await AutoModelForImageTextToText.from_pretrained(model_id, {
        ...deviceConfig,
        dtype: {
          embed_tokens: 'fp16',
          vision_encoder: 'q4',
          decoder_model_merged: 'q4',
        },
      });

      this.processor = processor;
      this.model = model;
    }

    this.updateProgress(30);
  }

  /**
   * Runs one greedy generation for a single-turn user message about `image`
   * and returns only the newly generated text.
   *
   * @param {*} image - Image handed to the processor.
   * @param {string} content - User message; carries the `<image>` placeholder.
   * @param {number} maxNewTokens - Upper bound on generated tokens.
   * @returns {Promise<string>} Decoded model answer without special tokens.
   */
  async generateText(image, content, maxNewTokens) {
    const messages = [{ role: 'user', content }];
    const prompt = this.processor.apply_chat_template(messages, {
      add_generation_prompt: true,
    });

    const inputs = await this.processor(image, prompt, {
      add_special_tokens: false,
    });

    const outputs = await this.model.generate({
      ...inputs,
      max_new_tokens: maxNewTokens,
      do_sample: false,
    });

    // The output starts with the prompt tokens; keep only what follows them.
    const promptLength = inputs.input_ids.dims.at(-1);
    const [text] = this.processor.batch_decode(
      outputs.slice(null, [promptLength, null]),
      { skip_special_tokens: true },
    );
    return text;
  }

  /**
   * Full analysis pipeline for the current video: extract frames, load the
   * model, describe every frame, summarize, render. Errors are logged and
   * shown in the results area; the UI is always unlocked afterwards.
   *
   * Does nothing when no video is loaded or an analysis is already running.
   *
   * @returns {Promise<void>}
   */
  async analyzeVideo() {
    if (this.isProcessing || !this.videoUrl) return;

    const { loadingOverlay, loadingText, analyzeBtn, resultsContainer } =
      this.elements;

    this.isProcessing = true;
    loadingOverlay.classList.add('active');
    analyzeBtn.disabled = true;
    resultsContainer.innerHTML = '';
    this.updateProgress(0);

    try {
      const frameCount = Number.parseInt(this.elements.frameCount.value, 10);
      const maxDuration = Number.parseInt(this.elements.maxDuration.value, 10);

      // Extraction reports 0-20% and the model load 30%, so extracting first
      // keeps the progress bar moving forward only.
      loadingText.textContent = 'Extracting frames...';
      const frames = await this.extractFramesFromVideo({ frameCount, maxDuration });
      if (frames.length === 0) {
        throw new Error('No frames could be extracted from the video');
      }

      await this.initializeModel();

      const frameAnalyses = [];
      const progressPerFrame = 60 / frames.length;

      for (let i = 0; i < frames.length; i++) {
        const frame = frames[i];
        loadingText.textContent = `Analyzing frame ${i + 1} of ${frames.length}...`;

        const image = await this.loadImageFromSource(frame.blob);
        const description = await this.generateText(
          image,
          `Frame at ${frame.time}s: <image>Describe what's happening in this frame in detail.`,
          256,
        );

        frameAnalyses.push({ time: frame.time, description });
        this.updateProgress(30 + (i + 1) * progressPerFrame);
      }

      loadingText.textContent = 'Generating summary...';

      // Generate summary. The first frame is passed to the processor, so the
      // prompt carries the matching <image> placeholder like the frame prompts.
      const frameNotes = frameAnalyses
        .map((f) => `At ${f.time}s: ${f.description}`)
        .join('\n');
      const summaryPrompt = `<image>Based on these frame descriptions from a video, provide a coherent summary of what happens in the video:\n\n${frameNotes}\n\nVideo Summary:`;

      const firstFrameImage = await this.loadImageFromSource(frames[0].blob);
      const summary = await this.generateText(firstFrameImage, summaryPrompt, 512);

      this.updateProgress(100);
      this.displayResults({ frames: frameAnalyses, summary });
    } catch (error) {
      console.error('Analysis error:', error);
      this.showError(error.message);
    } finally {
      this.isProcessing = false;
      loadingOverlay.classList.remove('active');
      analyzeBtn.disabled = false;
    }
  }

  /**
   * Renders the summary and the per-frame descriptions.
   *
   * All text is HTML-escaped before insertion because it is model output.
   *
   * @param {{summary?: string, frames: Array<{time: string, description: string}>}} analysis
   */
  displayResults(analysis) {
    const { escapeHtml } = VideoCaptionApp;

    const frameItems = analysis.frames
      .map(
        (frame) => `
          <div class="frame-item">
            <div class="frame-time">${escapeHtml(frame.time)}s</div>
            <div class="frame-description">${escapeHtml(frame.description)}</div>
          </div>
        `,
      )
      .join('');

    this.elements.resultsContainer.innerHTML = `
      <div class="summary-section">
        <h3>Video Summary</h3>
        <p>${escapeHtml(analysis.summary || 'No summary available')}</p>
      </div>
      <div class="frames-section">
        <h3>Frame-by-Frame Analysis</h3>
        <div class="frame-list">
          ${frameItems}
        </div>
      </div>
    `;
  }

  /**
   * Replaces the results area with an error notice.
   *
   * @param {string} message - Error text; HTML-escaped before insertion.
   */
  showError(message) {
    this.elements.resultsContainer.innerHTML = `
      <div class="error-message">
        <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
          <circle cx="12" cy="12" r="10"></circle>
          <line x1="12" y1="8" x2="12" y2="12"></line>
          <line x1="12" y1="16" x2="12.01" y2="16"></line>
        </svg>
        <p>Error: ${VideoCaptionApp.escapeHtml(message)}</p>
      </div>
    `;
  }
}
|
375 |
+
|
376 |
+
// Initialize app when DOM is ready. If this module is evaluated after
// DOMContentLoaded has already fired, the event never comes again, so start
// immediately in that case.
const startVideoCaptionApp = () => {
  new VideoCaptionApp();
};

if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', startVideoCaptionApp, { once: true });
} else {
  startVideoCaptionApp();
}
|