Spaces:
Running
Running
Upload index.js with huggingface_hub
Browse files
index.js
CHANGED
@@ -1,76 +1,364 @@
|
|
1 |
-
import {
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
}
|
|
|
26 |
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
const
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
});
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
const
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
boxElement.appendChild(labelElement);
|
75 |
-
imageContainer.appendChild(boxElement);
|
76 |
}
|
|
|
|
|
|
|
|
|
|
1 |
+
import {
|
2 |
+
AutoProcessor,
|
3 |
+
AutoModelForImageTextToText,
|
4 |
+
load_image,
|
5 |
+
TextStreamer,
|
6 |
+
} from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]";
|
7 |
+
|
8 |
+
/**
 * Browser controller for an in-page video captioning tool.
 *
 * The user picks a video, a number of frames is sampled from it, each frame is
 * captioned with the FastVLM vision-language model, and a final summary
 * caption is generated from the per-frame captions.
 *
 * The class expects the page to contain elements with the ids listed in
 * {@link VideoCaptionApp#initializeElements}.
 */
class VideoCaptionApp {
  constructor() {
    this.videoFile = null;
    // Object URL currently assigned to the preview player (revoked on replace).
    this.videoURL = null;
    this.model = null;
    this.processor = null;
    this.isProcessing = false;
    this.initializeElements();
    this.attachEventListeners();
    // Fire-and-forget: the method handles its own failures.
    void this.checkWebGPUSupport();
  }

  /**
   * Looks up every DOM node the app uses and stores them on `this.elements`,
   * keyed by element id.
   */
  initializeElements() {
    const ids = [
      'videoPlayer',
      'videoInput',
      'uploadArea',
      'processBtn',
      'frameCount',
      'deviceSelect',
      'results',
      'frameCaptions',
      'summaryText',
      'progressOverlay',
      'progressCircle',
      'progressText',
      'progressStatus',
      'controls',
      'copyBtn',
      'finalCaption',
    ];
    this.elements = Object.fromEntries(
      ids.map((id) => [id, document.getElementById(id)]),
    );
  }

  /**
   * Wires up upload (click, drag-and-drop, file input), the process button and
   * the copy button. Upload and process interactions are ignored while a video
   * is being processed.
   */
  attachEventListeners() {
    const { uploadArea, videoInput, processBtn, copyBtn } = this.elements;

    uploadArea.addEventListener('click', () => {
      if (!this.isProcessing) {
        videoInput.click();
      }
    });

    uploadArea.addEventListener('dragover', (e) => {
      // Required so the browser allows a drop on this element.
      e.preventDefault();
      if (!this.isProcessing) {
        uploadArea.classList.add('drag-over');
      }
    });

    uploadArea.addEventListener('dragleave', () => {
      uploadArea.classList.remove('drag-over');
    });

    uploadArea.addEventListener('drop', (e) => {
      e.preventDefault();
      uploadArea.classList.remove('drag-over');
      if (this.isProcessing || e.dataTransfer.files.length === 0) {
        return;
      }
      const file = e.dataTransfer.files[0];
      if (file.type.startsWith('video/')) {
        this.handleVideoUpload(file);
      }
    });

    videoInput.addEventListener('change', (e) => {
      if (e.target.files.length > 0) {
        this.handleVideoUpload(e.target.files[0]);
      }
    });

    processBtn.addEventListener('click', () => {
      if (!this.isProcessing && this.videoFile) {
        this.processVideo();
      }
    });

    copyBtn.addEventListener('click', () => {
      this.copyResults();
    });
  }

  /**
   * Disables the "webgpu" device option and falls back to "cpu" when WebGPU is
   * unavailable: either `navigator.gpu` is missing or no adapter can be
   * obtained.
   *
   * @returns {Promise<void>}
   */
  async checkWebGPUSupport() {
    let supported = false;
    try {
      supported = Boolean(navigator.gpu && (await navigator.gpu.requestAdapter()));
    } catch (err) {
      console.warn('WebGPU adapter request failed:', err);
    }
    if (supported) {
      return;
    }

    const { deviceSelect } = this.elements;
    // The option may be absent from the markup; do not throw in that case.
    const webgpuOption = deviceSelect.querySelector('option[value="webgpu"]');
    if (webgpuOption) {
      webgpuOption.disabled = true;
    }
    deviceSelect.value = 'cpu';
  }

  /**
   * Stores the chosen file, shows it in the preview player and switches the UI
   * from the upload area to the controls.
   *
   * @param {File} file - The video file selected or dropped by the user.
   */
  handleVideoUpload(file) {
    // Release the previous preview URL so replaced videos can be collected.
    if (this.videoURL) {
      URL.revokeObjectURL(this.videoURL);
    }
    this.videoFile = file;
    this.videoURL = URL.createObjectURL(file);

    const { videoPlayer, uploadArea, controls, results } = this.elements;
    videoPlayer.src = this.videoURL;
    uploadArea.style.display = 'none';
    controls.style.display = 'block';
    results.style.display = 'none';
  }

  /**
   * Updates the circular progress indicator and its status line.
   *
   * @param {number} percent - Progress in percent; clamped to 0..100, and
   *   non-finite values are treated as 0.
   * @param {string} status - Status message shown next to the indicator.
   */
  updateProgress(percent, status) {
    const pct = Number.isFinite(percent) ? Math.min(100, Math.max(0, percent)) : 0;
    // 45 is used as the radius of the progress circle.
    const circumference = 2 * Math.PI * 45;
    const offset = circumference - (pct / 100) * circumference;

    const { progressCircle, progressText, progressStatus } = this.elements;
    progressCircle.style.strokeDasharray = `${circumference} ${circumference}`;
    progressCircle.style.strokeDashoffset = offset;
    progressText.textContent = `${Math.round(pct)}%`;
    progressStatus.textContent = status;
  }

  /**
   * Seeks `video` to `time` and resolves once the seek has completed.
   *
   * @param {HTMLVideoElement} video - Video element with metadata loaded.
   * @param {number} time - Target position in seconds.
   * @returns {Promise<void>} Rejects if the element reports an error first.
   */
  seekVideo(video, time) {
    return new Promise((resolve, reject) => {
      const cleanup = () => {
        video.removeEventListener('seeked', onSeeked);
        video.removeEventListener('error', onError);
      };
      const onSeeked = () => {
        cleanup();
        resolve();
      };
      const onError = () => {
        cleanup();
        reject(new Error(`Failed to seek video to ${time}s`, { cause: video.error }));
      };
      // Listen before changing currentTime so the event cannot be missed.
      video.addEventListener('seeked', onSeeked);
      video.addEventListener('error', onError);
      video.currentTime = time;
    });
  }

  /**
   * Samples `numFrames` evenly spaced frames from a video, starting at 0s, and
   * encodes each as a JPEG blob. Frames are downscaled to fit within 1280x720
   * while preserving the aspect ratio. Progress is reported in the 0-20% range.
   *
   * @param {string} videoUrl - URL of the video (typically an object URL).
   * @param {number} [numFrames=8] - Number of frames to extract (integer >= 1).
   * @returns {Promise<Array<{blob: Blob, timestamp: number}>>} Frames in
   *   chronological order; `timestamp` is in seconds.
   * @throws {RangeError} If `numFrames` is not a positive integer.
   * @throws {Error} If the video cannot be loaded, has no finite duration or
   *   picture, cannot be seeked, or a frame cannot be encoded.
   */
  async extractFramesFromVideo(videoUrl, numFrames = 8) {
    if (!Number.isInteger(numFrames) || numFrames < 1) {
      throw new RangeError(`numFrames must be a positive integer, got ${numFrames}`);
    }

    const video = document.createElement('video');
    video.crossOrigin = 'anonymous';
    video.muted = true;

    try {
      await new Promise((resolve, reject) => {
        video.addEventListener('loadedmetadata', resolve, { once: true });
        video.addEventListener(
          'error',
          () => reject(new Error('Failed to load video for frame extraction', { cause: video.error })),
          { once: true },
        );
        video.src = videoUrl;
        video.load();
      });

      const { duration, videoWidth, videoHeight } = video;
      if (!Number.isFinite(duration) || duration <= 0) {
        throw new Error('Video duration is unknown; cannot sample frames');
      }
      if (!videoWidth || !videoHeight) {
        throw new Error('Video has no decodable picture');
      }

      // One scale factor for both axes keeps the aspect ratio; never upscale.
      const scale = Math.min(1, 1280 / videoWidth, 720 / videoHeight);
      const canvas = document.createElement('canvas');
      canvas.width = Math.max(1, Math.round(videoWidth * scale));
      canvas.height = Math.max(1, Math.round(videoHeight * scale));
      const ctx = canvas.getContext('2d');

      const interval = duration / numFrames;
      const frames = [];
      for (let i = 0; i < numFrames; i++) {
        const timestamp = i * interval;
        await this.seekVideo(video, timestamp);
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);

        const blob = await new Promise((done) => {
          canvas.toBlob(done, 'image/jpeg', 0.85);
        });
        if (!blob) {
          throw new Error(`Failed to encode frame at ${timestamp}s`);
        }

        frames.push({ blob, timestamp });
        this.updateProgress(
          (frames.length / numFrames) * 20,
          `Extracting frame ${frames.length}/${numFrames}...`,
        );
      }
      return frames;
    } finally {
      // Detach the source so the temporary element releases the media.
      video.removeAttribute('src');
      video.load();
    }
  }

  /**
   * Loads the processor and the model, storing them on `this.processor` and
   * `this.model`. The WebGPU device is requested only when selected in the
   * device dropdown; otherwise the library default is used.
   *
   * @returns {Promise<void>}
   * @throws Rethrows any loading error after logging it.
   */
  async initializeModel() {
    const device = this.elements.deviceSelect.value;
    const model_id = "onnx-community/FastVLM-0.5B-ONNX";

    this.updateProgress(25, 'Loading AI model...');

    try {
      this.processor = await AutoProcessor.from_pretrained(model_id);

      this.updateProgress(50, 'Initializing model...');

      const modelOptions = {
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
      };
      if (device === 'webgpu') {
        modelOptions.device = 'webgpu';
      }

      this.model = await AutoModelForImageTextToText.from_pretrained(model_id, modelOptions);

      this.updateProgress(60, 'Model ready');
    } catch (error) {
      console.error('Model initialization error:', error);
      throw error;
    }
  }

  /**
   * Runs one greedy generation for a single image plus a user message and
   * returns the generated text.
   *
   * @param {*} image - Image as returned by `load_image`.
   * @param {string} userContent - User message; must contain the `<image>`
   *   placeholder.
   * @param {number} maxNewTokens - Upper bound on generated tokens.
   * @param {(textSoFar: string) => void} [onText] - Called with the accumulated
   *   text each time the streamer emits a chunk.
   * @returns {Promise<string>} The generated text, trimmed.
   */
  async generateText(image, userContent, maxNewTokens, onText) {
    const prompt = this.processor.apply_chat_template(
      [{ role: "user", content: userContent }],
      { add_generation_prompt: true },
    );
    const inputs = await this.processor(image, prompt, {
      add_special_tokens: false,
    });

    let text = '';
    const streamer = new TextStreamer(this.processor.tokenizer, {
      skip_prompt: true,
      // Keep control tokens (e.g. end-of-turn markers) out of user-visible text.
      skip_special_tokens: true,
      callback_function: (chunk) => {
        text += chunk;
        if (onText) {
          onText(text);
        }
      },
    });

    await this.model.generate({
      ...inputs,
      max_new_tokens: maxNewTokens,
      do_sample: false,
      streamer,
    });

    return text.trim();
  }

  /**
   * Full pipeline for the selected video: extract frames, load the model if
   * needed, caption every frame, then generate the overall summary. Errors are
   * logged and reported with an alert; the busy state and the progress overlay
   * are always cleared afterwards.
   *
   * @returns {Promise<void>}
   */
  async processVideo() {
    const { elements } = this;
    this.isProcessing = true;
    elements.processBtn.classList.add('loading');
    elements.progressOverlay.classList.add('active');
    elements.results.style.display = 'none';
    elements.frameCaptions.innerHTML = '';
    elements.summaryText.textContent = '';

    let videoURL = null;
    try {
      const numFrames = Number.parseInt(elements.frameCount.value, 10);
      if (!Number.isInteger(numFrames) || numFrames < 1) {
        throw new RangeError(`Invalid frame count: ${elements.frameCount.value}`);
      }

      videoURL = URL.createObjectURL(this.videoFile);

      this.updateProgress(0, 'Starting...');

      const frames = await this.extractFramesFromVideo(videoURL, numFrames);

      if (!this.model || !this.processor) {
        await this.initializeModel();
      }

      const allCaptions = [];
      const totalSteps = frames.length;

      for (let i = 0; i < totalSteps; i++) {
        const { blob, timestamp } = frames[i];
        // Frame analysis occupies the 60-90% range of the progress bar.
        this.updateProgress(
          60 + (i / totalSteps) * 30,
          `Analyzing frame ${i + 1}/${totalSteps}...`,
        );

        const frameUrl = URL.createObjectURL(blob);
        let caption;
        try {
          const image = await load_image(frameUrl);
          caption = await this.generateText(
            image,
            `<image>This is frame ${i + 1} of ${numFrames} from a video at ${timestamp.toFixed(1)}s. Describe what's happening in this frame, focusing on actions, objects, and any notable changes.`,
            256,
          );
        } finally {
          URL.revokeObjectURL(frameUrl);
        }

        const entry = { frame: i + 1, timestamp, caption };
        allCaptions.push(entry);
        this.displayFrameCaption(entry);
      }

      this.updateProgress(95, 'Generating video summary...');

      await this.generateVideoSummary(frames[frames.length - 1], allCaptions);

      this.updateProgress(100, 'Complete!');

      // Leave the 100% state visible briefly before revealing the results.
      setTimeout(() => {
        elements.progressOverlay.classList.remove('active');
        elements.results.style.display = 'block';
      }, 500);
    } catch (error) {
      console.error('Processing error:', error);
      // Without this the overlay would stay on top of the page after a failure.
      elements.progressOverlay.classList.remove('active');
      alert('An error occurred while processing the video. Please try again.');
    } finally {
      if (videoURL) {
        URL.revokeObjectURL(videoURL);
      }
      this.isProcessing = false;
      elements.processBtn.classList.remove('loading');
    }
  }

  /**
   * Appends one frame caption card to the captions list. All values are
   * inserted as text, never as HTML, because the caption is model output.
   *
   * @param {{frame: number, timestamp: number, caption: string}} captionData
   */
  displayFrameCaption(captionData) {
    const make = (tag, className, text) => {
      const node = document.createElement(tag);
      node.className = className;
      if (text !== undefined) {
        node.textContent = text;
      }
      return node;
    };

    const header = make('div', 'frame-header');
    header.appendChild(make('span', 'frame-number', `Frame ${captionData.frame}`));
    header.appendChild(make('span', 'frame-time', this.formatTime(captionData.timestamp)));

    const item = make('div', 'frame-caption-item');
    item.appendChild(header);
    item.appendChild(make('p', 'frame-text', captionData.caption));

    this.elements.frameCaptions.appendChild(item);
  }

  /**
   * Generates the overall video caption from the last frame plus all per-frame
   * captions, streaming the text into the summary element as it is produced.
   *
   * @param {{blob: Blob, timestamp: number}} lastFrame - Frame used as the image input.
   * @param {Array<{caption: string}>} allCaptions - Per-frame captions, in order.
   * @returns {Promise<void>}
   */
  async generateVideoSummary(lastFrame, allCaptions) {
    const frameUrl = URL.createObjectURL(lastFrame.blob);
    try {
      const image = await load_image(frameUrl);
      const summary = await this.generateText(
        image,
        `<image>Based on this frame and knowing that the video shows: ${
          allCaptions.map((fc) => fc.caption).join('; ')
        }, provide a comprehensive caption for the entire video.`,
        512,
        (textSoFar) => {
          this.elements.summaryText.textContent = textSoFar;
        },
      );
      this.elements.summaryText.textContent = summary;
    } finally {
      URL.revokeObjectURL(frameUrl);
    }
  }

  /**
   * Formats a duration as `m:ss`.
   *
   * @param {number} seconds - Duration in seconds; negative or non-finite
   *   values are treated as 0.
   * @returns {string} For example `75` -> `"1:15"`.
   */
  formatTime(seconds) {
    const total = Number.isFinite(seconds) && seconds > 0 ? seconds : 0;
    const mins = Math.floor(total / 60);
    const secs = Math.floor(total % 60);
    return `${mins}:${String(secs).padStart(2, '0')}`;
  }

  /**
   * Copies all frame captions and the summary to the clipboard and briefly
   * marks the copy button as "copied". Clipboard failures are logged.
   *
   * @returns {Promise<void>}
   */
  async copyResults() {
    const { frameCaptions, summaryText, copyBtn } = this.elements;
    const captions = Array.from(
      frameCaptions.querySelectorAll('.frame-caption-item .frame-text'),
      (el) => el.textContent,
    ).join('\n\n');

    const fullText = `Frame Captions:\n${captions}\n\nVideo Summary:\n${summaryText.textContent}`;

    try {
      await navigator.clipboard.writeText(fullText);
      copyBtn.classList.add('copied');
      setTimeout(() => {
        copyBtn.classList.remove('copied');
      }, 2000);
    } catch (err) {
      console.error('Failed to copy:', err);
    }
  }
}
|
361 |
+
|
362 |
+
// Start the app once the DOM is parsed. If this module is evaluated after
// DOMContentLoaded has already fired (e.g. it was injected dynamically), the
// listener would never run, so start immediately in that case.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => {
    new VideoCaptionApp();
  });
} else {
  new VideoCaptionApp();
}
|