akhaliq HF Staff committed on
Commit
5276df5
·
verified ·
1 Parent(s): fa49b89

Upload index.js with huggingface_hub

Browse files
Files changed (1) hide show
  1. index.js +354 -66
index.js CHANGED
@@ -1,76 +1,364 @@
1
- import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/[email protected]';
2
-
3
- // Reference the elements that we will need
4
- const status = document.getElementById('status');
5
- const fileUpload = document.getElementById('upload');
6
- const imageContainer = document.getElementById('container');
7
- const example = document.getElementById('example');
8
-
9
- const EXAMPLE_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/city-streets.jpg';
10
-
11
- // Create a new object detection pipeline
12
- status.textContent = 'Loading model...';
13
- const detector = await pipeline('object-detection', 'Xenova/detr-resnet-50');
14
- status.textContent = 'Ready';
15
-
16
- example.addEventListener('click', (e) => {
17
- e.preventDefault();
18
- detect(EXAMPLE_URL);
19
- });
20
-
21
- fileUpload.addEventListener('change', function (e) {
22
- const file = e.target.files[0];
23
- if (!file) {
24
- return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
 
26
 
27
- const reader = new FileReader();
 
 
 
 
 
 
 
28
 
29
- // Set up a callback when the file is loaded
30
- reader.onload = e2 => detect(e2.target.result);
 
 
 
 
 
 
31
 
32
- reader.readAsDataURL(file);
33
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- // Detect objects in the image
37
- async function detect(img) {
38
- imageContainer.innerHTML = '';
39
- imageContainer.style.backgroundImage = `url(${img})`;
 
 
 
 
 
 
 
 
40
 
41
- status.textContent = 'Analysing...';
42
- const output = await detector(img, {
43
- threshold: 0.5,
44
- percentage: true,
 
 
 
 
 
 
 
 
 
 
 
45
  });
46
- status.textContent = '';
47
- output.forEach(renderBox);
48
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- // Render a bounding box and label on the image
51
- function renderBox({ box, label }) {
52
- const { xmax, xmin, ymax, ymin } = box;
53
-
54
- // Generate a random color for the box
55
- const color = '#' + Math.floor(Math.random() * 0xFFFFFF).toString(16).padStart(6, 0);
56
-
57
- // Draw the box
58
- const boxElement = document.createElement('div');
59
- boxElement.className = 'bounding-box';
60
- Object.assign(boxElement.style, {
61
- borderColor: color,
62
- left: 100 * xmin + '%',
63
- top: 100 * ymin + '%',
64
- width: 100 * (xmax - xmin) + '%',
65
- height: 100 * (ymax - ymin) + '%',
66
- })
67
-
68
- // Draw label
69
- const labelElement = document.createElement('span');
70
- labelElement.textContent = label;
71
- labelElement.className = 'bounding-box-label';
72
- labelElement.style.backgroundColor = color;
73
-
74
- boxElement.appendChild(labelElement);
75
- imageContainer.appendChild(boxElement);
76
  }
 
 
 
 
 
1
+ import {
2
+ AutoProcessor,
3
+ AutoModelForImageTextToText,
4
+ load_image,
5
+ TextStreamer,
6
+ } from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]";
7
+
8
+ class VideoCaptionApp {
9
+ constructor() {
10
+ this.videoFile = null;
11
+ this.model = null;
12
+ this.processor = null;
13
+ this.isProcessing = false;
14
+ this.initializeElements();
15
+ this.attachEventListeners();
16
+ this.checkWebGPUSupport();
17
+ }
18
+
19
+ initializeElements() {
20
+ this.elements = {
21
+ videoPlayer: document.getElementById('videoPlayer'),
22
+ videoInput: document.getElementById('videoInput'),
23
+ uploadArea: document.getElementById('uploadArea'),
24
+ processBtn: document.getElementById('processBtn'),
25
+ frameCount: document.getElementById('frameCount'),
26
+ deviceSelect: document.getElementById('deviceSelect'),
27
+ results: document.getElementById('results'),
28
+ frameCaptions: document.getElementById('frameCaptions'),
29
+ summaryText: document.getElementById('summaryText'),
30
+ progressOverlay: document.getElementById('progressOverlay'),
31
+ progressCircle: document.getElementById('progressCircle'),
32
+ progressText: document.getElementById('progressText'),
33
+ progressStatus: document.getElementById('progressStatus'),
34
+ controls: document.getElementById('controls'),
35
+ copyBtn: document.getElementById('copyBtn'),
36
+ finalCaption: document.getElementById('finalCaption')
37
+ };
38
+ }
39
+
40
+ attachEventListeners() {
41
+ this.elements.uploadArea.addEventListener('click', () => {
42
+ if (!this.isProcessing) {
43
+ this.elements.videoInput.click();
44
+ }
45
+ });
46
+
47
+ this.elements.uploadArea.addEventListener('dragover', (e) => {
48
+ e.preventDefault();
49
+ if (!this.isProcessing) {
50
+ this.elements.uploadArea.classList.add('drag-over');
51
+ }
52
+ });
53
+
54
+ this.elements.uploadArea.addEventListener('dragleave', () => {
55
+ this.elements.uploadArea.classList.remove('drag-over');
56
+ });
57
+
58
+ this.elements.uploadArea.addEventListener('drop', (e) => {
59
+ e.preventDefault();
60
+ this.elements.uploadArea.classList.remove('drag-over');
61
+ if (!this.isProcessing && e.dataTransfer.files.length > 0) {
62
+ const file = e.dataTransfer.files[0];
63
+ if (file.type.startsWith('video/')) {
64
+ this.handleVideoUpload(file);
65
+ }
66
+ }
67
+ });
68
+
69
+ this.elements.videoInput.addEventListener('change', (e) => {
70
+ if (e.target.files.length > 0) {
71
+ this.handleVideoUpload(e.target.files[0]);
72
+ }
73
+ });
74
+
75
+ this.elements.processBtn.addEventListener('click', () => {
76
+ if (!this.isProcessing && this.videoFile) {
77
+ this.processVideo();
78
+ }
79
+ });
80
+
81
+ this.elements.copyBtn.addEventListener('click', () => {
82
+ this.copyResults();
83
+ });
84
+ }
85
+
86
+ async checkWebGPUSupport() {
87
+ if (!navigator.gpu) {
88
+ this.elements.deviceSelect.querySelector('option[value="webgpu"]').disabled = true;
89
+ this.elements.deviceSelect.value = 'cpu';
90
  }
91
+ }
92
 
93
+ handleVideoUpload(file) {
94
+ this.videoFile = file;
95
+ const videoURL = URL.createObjectURL(file);
96
+ this.elements.videoPlayer.src = videoURL;
97
+ this.elements.uploadArea.style.display = 'none';
98
+ this.elements.controls.style.display = 'block';
99
+ this.elements.results.style.display = 'none';
100
+ }
101
 
102
+ updateProgress(percent, status) {
103
+ const circumference = 2 * Math.PI * 45;
104
+ const offset = circumference - (percent / 100) * circumference;
105
+ this.elements.progressCircle.style.strokeDasharray = `${circumference} ${circumference}`;
106
+ this.elements.progressCircle.style.strokeDashoffset = offset;
107
+ this.elements.progressText.textContent = `${Math.round(percent)}%`;
108
+ this.elements.progressStatus.textContent = status;
109
+ }
110
 
111
+ async extractFramesFromVideo(videoUrl, numFrames = 8) {
112
+ return new Promise((resolve, reject) => {
113
+ const video = document.createElement('video');
114
+ video.crossOrigin = 'anonymous';
115
+ video.muted = true;
116
+
117
+ const frames = [];
118
+ let captureCount = 0;
119
+
120
+ video.addEventListener('loadedmetadata', async () => {
121
+ const duration = video.duration;
122
+ const interval = duration / numFrames;
123
+
124
+ for (let i = 0; i < numFrames; i++) {
125
+ video.currentTime = i * interval;
126
+
127
+ await new Promise(seekResolve => {
128
+ video.addEventListener('seeked', async () => {
129
+ const canvas = document.createElement('canvas');
130
+ canvas.width = Math.min(video.videoWidth, 1280);
131
+ canvas.height = Math.min(video.videoHeight, 720);
132
+ const ctx = canvas.getContext('2d');
133
+ ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
134
+
135
+ const blob = await new Promise(blobResolve => {
136
+ canvas.toBlob(blobResolve, 'image/jpeg', 0.85);
137
+ });
138
+
139
+ frames.push({
140
+ blob,
141
+ timestamp: i * interval
142
+ });
143
+ captureCount++;
144
+
145
+ this.updateProgress((captureCount / numFrames) * 20, `Extracting frame ${captureCount}/${numFrames}...`);
146
+
147
+ if (captureCount === numFrames) {
148
+ resolve(frames);
149
+ }
150
+
151
+ seekResolve();
152
+ }, { once: true });
153
+ });
154
+ }
155
+ });
156
+
157
+ video.addEventListener('error', reject);
158
+ video.src = videoUrl;
159
+ video.load();
160
+ });
161
+ }
162
 
163
+ async initializeModel() {
164
+ const device = this.elements.deviceSelect.value;
165
+ const model_id = "onnx-community/FastVLM-0.5B-ONNX";
166
+
167
+ this.updateProgress(25, 'Loading AI model...');
168
+
169
+ try {
170
+ this.processor = await AutoProcessor.from_pretrained(model_id);
171
+
172
+ this.updateProgress(50, 'Initializing model...');
173
+
174
+ const modelOptions = {
175
+ dtype: {
176
+ embed_tokens: "fp16",
177
+ vision_encoder: "q4",
178
+ decoder_model_merged: "q4",
179
+ }
180
+ };
181
+
182
+ if (device === 'webgpu') {
183
+ modelOptions.device = 'webgpu';
184
+ }
185
+
186
+ this.model = await AutoModelForImageTextToText.from_pretrained(model_id, modelOptions);
187
+
188
+ this.updateProgress(60, 'Model ready');
189
+ } catch (error) {
190
+ console.error('Model initialization error:', error);
191
+ throw error;
192
+ }
193
+ }
194
+
195
+ async processVideo() {
196
+ this.isProcessing = true;
197
+ this.elements.processBtn.classList.add('loading');
198
+ this.elements.progressOverlay.classList.add('active');
199
+ this.elements.results.style.display = 'none';
200
+ this.elements.frameCaptions.innerHTML = '';
201
+
202
+ try {
203
+ const videoURL = URL.createObjectURL(this.videoFile);
204
+ const numFrames = parseInt(this.elements.frameCount.value);
205
+
206
+ this.updateProgress(0, 'Starting...');
207
+
208
+ const frames = await this.extractFramesFromVideo(videoURL, numFrames);
209
+
210
+ if (!this.model) {
211
+ await this.initializeModel();
212
+ }
213
+
214
+ const allCaptions = [];
215
+ const totalSteps = frames.length;
216
+
217
+ for (let i = 0; i < frames.length; i++) {
218
+ const progress = 60 + (i / totalSteps) * 30;
219
+ this.updateProgress(progress, `Analyzing frame ${i + 1}/${totalSteps}...`);
220
+
221
+ const frameUrl = URL.createObjectURL(frames[i].blob);
222
+ const image = await load_image(frameUrl);
223
+
224
+ const messages = [
225
+ {
226
+ role: "user",
227
+ content: `<image>This is frame ${i + 1} of ${numFrames} from a video at ${frames[i].timestamp.toFixed(1)}s. Describe what's happening in this frame, focusing on actions, objects, and any notable changes.`,
228
+ },
229
+ ];
230
+
231
+ const prompt = this.processor.apply_chat_template(messages, {
232
+ add_generation_prompt: true,
233
+ });
234
+
235
+ const inputs = await this.processor(image, prompt, {
236
+ add_special_tokens: false,
237
+ });
238
+
239
+ let captionText = '';
240
+ const streamer = new TextStreamer(this.processor.tokenizer, {
241
+ skip_prompt: true,
242
+ skip_special_tokens: false,
243
+ callback_function: (text) => {
244
+ captionText += text;
245
+ }
246
+ });
247
+
248
+ const outputs = await this.model.generate({
249
+ ...inputs,
250
+ max_new_tokens: 256,
251
+ do_sample: false,
252
+ streamer: streamer,
253
+ });
254
+
255
+ allCaptions.push({
256
+ frame: i + 1,
257
+ timestamp: frames[i].timestamp,
258
+ caption: captionText.trim()
259
+ });
260
+
261
+ this.displayFrameCaption(allCaptions[allCaptions.length - 1]);
262
+
263
+ URL.revokeObjectURL(frameUrl);
264
+ }
265
+
266
+ this.updateProgress(95, 'Generating video summary...');
267
+
268
+ await this.generateVideoSummary(frames[frames.length - 1], allCaptions);
269
+
270
+ this.updateProgress(100, 'Complete!');
271
+
272
+ setTimeout(() => {
273
+ this.elements.progressOverlay.classList.remove('active');
274
+ this.elements.results.style.display = 'block';
275
+ }, 500);
276
+
277
+ } catch (error) {
278
+ console.error('Processing error:', error);
279
+ alert('An error occurred while processing the video. Please try again.');
280
+ } finally {
281
+ this.isProcessing = false;
282
+ this.elements.processBtn.classList.remove('loading');
283
+ }
284
+ }
285
 
286
+ displayFrameCaption(captionData) {
287
+ const captionElement = document.createElement('div');
288
+ captionElement.className = 'frame-caption-item';
289
+ captionElement.innerHTML = `
290
+ <div class="frame-header">
291
+ <span class="frame-number">Frame ${captionData.frame}</span>
292
+ <span class="frame-time">${this.formatTime(captionData.timestamp)}</span>
293
+ </div>
294
+ <p class="frame-text">${captionData.caption}</p>
295
+ `;
296
+ this.elements.frameCaptions.appendChild(captionElement);
297
+ }
298
 
299
+ async generateVideoSummary(lastFrame, allCaptions) {
300
+ const frameUrl = URL.createObjectURL(lastFrame.blob);
301
+ const image = await load_image(frameUrl);
302
+
303
+ const summaryPrompt = this.processor.apply_chat_template([
304
+ {
305
+ role: "user",
306
+ content: `<image>Based on this frame and knowing that the video shows: ${
307
+ allCaptions.map(fc => fc.caption).join('; ')
308
+ }, provide a comprehensive caption for the entire video.`
309
+ }
310
+ ], { add_generation_prompt: true });
311
+
312
+ const summaryInputs = await this.processor(image, summaryPrompt, {
313
+ add_special_tokens: false,
314
  });
315
+
316
+ let summaryText = '';
317
+ const streamer = new TextStreamer(this.processor.tokenizer, {
318
+ skip_prompt: true,
319
+ skip_special_tokens: false,
320
+ callback_function: (text) => {
321
+ summaryText += text;
322
+ this.elements.summaryText.textContent = summaryText;
323
+ }
324
+ });
325
+
326
+ await this.model.generate({
327
+ ...summaryInputs,
328
+ max_new_tokens: 512,
329
+ do_sample: false,
330
+ streamer: streamer,
331
+ });
332
+
333
+ URL.revokeObjectURL(frameUrl);
334
+ }
335
 
336
+ formatTime(seconds) {
337
+ const mins = Math.floor(seconds / 60);
338
+ const secs = Math.floor(seconds % 60);
339
+ return `${mins}:${secs.toString().padStart(2, '0')}`;
340
+ }
341
+
342
+ async copyResults() {
343
+ const frameCaptions = Array.from(this.elements.frameCaptions.querySelectorAll('.frame-caption-item'))
344
+ .map(el => el.querySelector('.frame-text').textContent)
345
+ .join('\n\n');
346
+
347
+ const summary = this.elements.summaryText.textContent;
348
+ const fullText = `Frame Captions:\n${frameCaptions}\n\nVideo Summary:\n${summary}`;
349
+
350
+ try {
351
+ await navigator.clipboard.writeText(fullText);
352
+ this.elements.copyBtn.classList.add('copied');
353
+ setTimeout(() => {
354
+ this.elements.copyBtn.classList.remove('copied');
355
+ }, 2000);
356
+ } catch (err) {
357
+ console.error('Failed to copy:', err);
358
+ }
359
+ }
 
 
360
  }
361
+
362
+ document.addEventListener('DOMContentLoaded', () => {
363
+ new VideoCaptionApp();
364
+ });