import {
  AutoProcessor,
  AutoModelForImageTextToText,
  RawImage,
  TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2';

/** Hugging Face model repository used for captioning. */
const MODEL_ID = 'onnx-community/FastVLM-0.5B-ONNX';

/** Largest accepted upload, in bytes (100 MB). */
const MAX_VIDEO_BYTES = 100 * 1024 * 1024;

/** Length of the final SRT cue, which has no following cue to end at. */
const LAST_CUE_SECONDS = 5;

let processor = null;
let model = null;
let videoFile = null;
// Object URL currently assigned to the player; tracked so it can be revoked.
let videoUrl = null;
let frames = [];
let captions = [];

// DOM Elements
const uploadArea = document.getElementById('uploadArea');
const videoInput = document.getElementById('videoInput');
const videoSection = document.getElementById('videoSection');
const videoPlayer = document.getElementById('videoPlayer');
const frameCanvas = document.getElementById('frameCanvas');
const processBtn = document.getElementById('processBtn');
const progressSection = document.getElementById('progressSection');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const resultsSection = document.getElementById('resultsSection');
const framesList = document.getElementById('framesList');
const deviceSelect = document.getElementById('deviceSelect');

/**
 * Disables the WebGPU option and falls back to WASM when WebGPU is unusable.
 *
 * The presence of `navigator.gpu` alone is not enough: the browser may expose
 * the API but have no adapter to hand out, so an adapter is requested too.
 *
 * @returns {Promise<void>}
 */
async function checkWebGPU() {
  let isSupported = false;
  if (navigator.gpu) {
    try {
      const adapter = await navigator.gpu.requestAdapter();
      isSupported = adapter != null;
    } catch (error) {
      console.warn('WebGPU adapter request failed:', error);
    }
  }
  if (isSupported) {
    return;
  }

  const webgpuOption = deviceSelect.querySelector('option[value="webgpu"]');
  if (webgpuOption) {
    webgpuOption.disabled = true;
  }
  deviceSelect.value = 'wasm';
}

/**
 * Loads the processor and the model into the module-level `processor` and
 * `model` variables, reporting progress through the progress bar.
 *
 * @returns {Promise<boolean>} `true` when both loaded, `false` on any failure
 *   (the error is logged and a message is shown to the user).
 */
async function initializeModel() {
  try {
    progressText.textContent = 'Loading processor...';
    progressFill.style.width = '30%';
    processor = await AutoProcessor.from_pretrained(MODEL_ID);

    progressText.textContent = 'Loading model (this may take a moment)...';
    progressFill.style.width = '60%';
    const device = deviceSelect.value === 'webgpu' ? 'webgpu' : 'wasm';
    model = await AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
      device,
      dtype: {
        embed_tokens: 'fp16',
        vision_encoder: 'q4',
        decoder_model_merged: 'q4',
      },
    });

    progressFill.style.width = '100%';
    progressText.textContent = 'Model loaded successfully!';
    return true;
  } catch (error) {
    console.error('Error initializing model:', error);
    progressText.textContent = 'Error loading model. Please refresh and try again.';
    return false;
  }
}

// Upload handling
uploadArea.addEventListener('click', () => videoInput.click());

uploadArea.addEventListener('dragover', (e) => {
  // Without preventDefault the browser refuses the drop.
  e.preventDefault();
  uploadArea.classList.add('dragover');
});

uploadArea.addEventListener('dragleave', () => {
  uploadArea.classList.remove('dragover');
});

uploadArea.addEventListener('drop', (e) => {
  e.preventDefault();
  uploadArea.classList.remove('dragover');
  const [file] = e.dataTransfer.files;
  if (file && file.type.startsWith('video/')) {
    handleVideoFile(file);
  }
});

videoInput.addEventListener('change', (e) => {
  const [file] = e.target.files;
  if (file) {
    handleVideoFile(file);
  }
});

/**
 * Accepts a newly chosen video: loads it into the player, shows the video
 * section, hides old results and clears previously extracted data.
 *
 * @param {File} file - The selected or dropped video file.
 */
function handleVideoFile(file) {
  if (file.size > MAX_VIDEO_BYTES) {
    alert('File size exceeds 100MB limit');
    return;
  }

  // Release the previous video's object URL so its data can be freed.
  if (videoUrl) {
    URL.revokeObjectURL(videoUrl);
  }

  videoFile = file;
  videoUrl = URL.createObjectURL(file);
  videoPlayer.src = videoUrl;
  videoSection.classList.remove('hidden');
  resultsSection.classList.add('hidden');
  frames = [];
  captions = [];
}

/**
 * Resolves the next time `eventName` fires on `target`, or rejects if an
 * `error` event fires first. Both listeners are removed once either fires.
 *
 * @param {EventTarget} target
 * @param {string} eventName
 * @returns {Promise<void>}
 */
function waitForEvent(target, eventName) {
  return new Promise((resolve, reject) => {
    const cleanup = () => {
      target.removeEventListener(eventName, onEvent);
      target.removeEventListener('error', onError);
    };
    const onEvent = () => {
      cleanup();
      resolve();
    };
    const onError = () => {
      cleanup();
      reject(new Error(`"error" event fired while waiting for "${eventName}"`));
    };
    target.addEventListener(eventName, onEvent);
    target.addEventListener('error', onError);
  });
}

/**
 * Captures one JPEG frame every N seconds of the loaded video, where N is the
 * value of the `frameInterval` input.
 *
 * Replaces the module-level `frames` array and also returns it.
 *
 * @returns {Promise<Array<{time: number, blob: Blob, dataUrl: string}>>}
 * @throws {Error} If the interval is not a positive number, the duration is
 *   not finite, the video errors while seeking, or a frame cannot be encoded.
 */
async function extractFrames() {
  const interval = Number.parseInt(document.getElementById('frameInterval').value, 10);
  // A zero/negative step would never reach the end of the video.
  if (!Number.isFinite(interval) || interval <= 0) {
    throw new Error('Frame interval must be a positive number of seconds');
  }

  // duration and videoWidth/videoHeight are only known once metadata is loaded.
  if (videoPlayer.readyState < HTMLMediaElement.HAVE_METADATA) {
    await waitForEvent(videoPlayer, 'loadedmetadata');
  }
  const { duration } = videoPlayer;
  if (!Number.isFinite(duration)) {
    throw new Error('Video duration is unknown; cannot extract frames');
  }

  frameCanvas.width = videoPlayer.videoWidth;
  frameCanvas.height = videoPlayer.videoHeight;
  const ctx = frameCanvas.getContext('2d');

  frames = [];
  for (let time = 0; time < duration; time += interval) {
    // Subscribe before seeking so the event cannot be missed.
    const seeked = waitForEvent(videoPlayer, 'seeked');
    videoPlayer.currentTime = time;
    await seeked;

    ctx.drawImage(videoPlayer, 0, 0);
    const blob = await new Promise((resolve) => {
      frameCanvas.toBlob(resolve, 'image/jpeg', 0.9);
    });
    // toBlob passes null when the image could not be created.
    if (!blob) {
      throw new Error(`Could not encode frame at ${time}s`);
    }

    frames.push({ time, blob, dataUrl: await blobToDataUrl(blob) });
  }
  return frames;
}

/**
 * Reads a Blob as a `data:` URL.
 *
 * @param {Blob} blob
 * @returns {Promise<string>} The data URL; rejects if the read fails.
 */
function blobToDataUrl(blob) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    reader.onerror = () => reject(reader.error ?? new Error('Failed to read blob'));
    reader.readAsDataURL(blob);
  });
}

/**
 * Generates a caption for one frame and updates the progress display.
 *
 * Requires `processor` and `model` to be loaded (see `initializeModel`).
 *
 * @param {string} imageDataUrl - Frame image as a data URL.
 * @param {number} frameIndex - Zero-based index of this frame.
 * @param {number} totalFrames - Total number of frames being processed.
 * @returns {Promise<string>} The caption, or a fixed error message if
 *   captioning failed (the error is logged, never thrown).
 */
async function generateCaption(imageDataUrl, frameIndex, totalFrames) {
  try {
    progressText.textContent = `Processing frame ${frameIndex + 1} of ${totalFrames}...`;
    progressFill.style.width = `${((frameIndex + 1) / totalFrames) * 100}%`;

    // NOTE(review): the leading <image> placeholder follows the model card's
    // Transformers.js usage example — confirm against the model card.
    const messages = [
      {
        role: 'user',
        content: '<image>Describe this video frame in detail. What is happening in this scene?',
      },
    ];
    const prompt = processor.apply_chat_template(messages, {
      add_generation_prompt: true,
    });

    // The processor is given the library's own image type rather than a DOM
    // <img> element. fromURL rejects on failure, so a bad image cannot hang.
    const image = await RawImage.fromURL(imageDataUrl);
    const inputs = await processor(image, prompt, {
      add_special_tokens: false,
    });

    const outputs = await model.generate({
      ...inputs,
      max_new_tokens: 256,
      do_sample: false,
    });

    // Drop the prompt tokens so only newly generated text is decoded.
    const promptLength = inputs.input_ids.dims.at(-1);
    const [caption] = processor.batch_decode(
      outputs.slice(null, [promptLength, null]),
      { skip_special_tokens: true },
    );
    return caption;
  } catch (error) {
    console.error('Error generating caption:', error);
    return 'Error generating caption for this frame';
  }
}

// Process video
processBtn.addEventListener('click', async () => {
  if (!videoFile) {
    return;
  }

  processBtn.disabled = true;
  progressSection.classList.remove('hidden');
  resultsSection.classList.add('hidden');

  try {
    // Initialize model if not already loaded
    if (!model || !processor) {
      const isLoaded = await initializeModel();
      if (!isLoaded) {
        return;
      }
    }

    // Extract frames
    progressText.textContent = 'Extracting frames...';
    progressFill.style.width = '20%';
    frames = await extractFrames();

    // Generate captions one frame at a time
    captions = [];
    for (const [i, frame] of frames.entries()) {
      const caption = await generateCaption(frame.dataUrl, i, frames.length);
      captions.push({ time: frame.time, caption, thumbnail: frame.dataUrl });

      // Update results in real-time
      displayResults();
      resultsSection.classList.remove('hidden');
    }

    progressText.textContent = 'Processing complete!';
    setTimeout(() => {
      progressSection.classList.add('hidden');
    }, 2000);
  } catch (error) {
    console.error('Processing error:', error);
    progressText.textContent = 'Error processing video. Please try again.';
  } finally {
    // Runs on success, on failure and on the early return above.
    processBtn.disabled = false;
  }
});

/**
 * Builds the card element for one captioned frame.
 *
 * Text is assigned through `textContent`/attributes, never parsed as HTML,
 * because the caption is model output.
 *
 * NOTE(review): only the `frame-card` class is known from the original code;
 * the inner element types and the `frame-thumbnail` / `frame-time` /
 * `frame-caption` class names are assumed — confirm against the stylesheet.
 *
 * @param {{time: number, caption: string, thumbnail: string}} item
 * @returns {HTMLDivElement}
 */
function createFrameCard({ time, caption, thumbnail }) {
  const label = formatTime(time);

  const card = document.createElement('div');
  card.className = 'frame-card';

  const image = document.createElement('img');
  image.className = 'frame-thumbnail';
  image.src = thumbnail;
  image.alt = `Frame at ${label}`;

  const timeLabel = document.createElement('div');
  timeLabel.className = 'frame-time';
  timeLabel.textContent = label;

  const captionText = document.createElement('p');
  captionText.className = 'frame-caption';
  captionText.textContent = caption;

  card.append(image, timeLabel, captionText);
  return card;
}

/** Re-renders the results list from the module-level `captions` array. */
function displayResults() {
  const cards = captions.map((item) => createFrameCard(item));
  framesList.replaceChildren(...cards);
}

/**
 * Zero-pads a non-negative integer to at least `width` digits.
 *
 * @param {number} value
 * @param {number} [width=2]
 * @returns {string}
 */
function padNumber(value, width = 2) {
  return String(value).padStart(width, '0');
}

/**
 * Formats a number of seconds as `MM:SS` (minutes are not capped at 59).
 *
 * @param {number} seconds
 * @returns {string}
 */
function formatTime(seconds) {
  const mins = Math.floor(seconds / 60);
  const secs = Math.floor(seconds % 60);
  return `${padNumber(mins)}:${padNumber(secs)}`;
}

// Export functions
document.getElementById('exportJson').addEventListener('click', () => {
  const data = JSON.stringify(captions, null, 2);
  downloadFile(data, 'captions.json', 'application/json');
});

document.getElementById('exportSrt').addEventListener('click', () => {
  const cues = captions.map((item, index) => {
    // End each cue where the next one starts so cues never overlap; the last
    // cue has no successor and gets a fixed length.
    const endSeconds = captions[index + 1]?.time ?? item.time + LAST_CUE_SECONDS;
    const startTime = formatSrtTime(item.time);
    const endTime = formatSrtTime(endSeconds);
    return `${index + 1}\n${startTime} --> ${endTime}\n${item.caption}\n\n`;
  });
  downloadFile(cues.join(''), 'captions.srt', 'text/plain');
});

document.getElementById('exportTxt').addEventListener('click', () => {
  const txt = captions
    .map((item) => `[${formatTime(item.time)}] ${item.caption}\n\n`)
    .join('');
  downloadFile(txt, 'captions.txt', 'text/plain');
});

/**
 * Formats a number of seconds as an SRT timestamp, `HH:MM:SS,mmm`.
 *
 * Works in whole milliseconds so fractional inputs are rounded once instead
 * of being truncated by floating-point error.
 *
 * @param {number} seconds
 * @returns {string}
 */
function formatSrtTime(seconds) {
  const totalMs = Math.max(0, Math.round(seconds * 1000));
  const ms = totalMs % 1000;
  const totalSeconds = Math.floor(totalMs / 1000);
  const hours = Math.floor(totalSeconds / 3600);
  const mins = Math.floor((totalSeconds % 3600) / 60);
  const secs = totalSeconds % 60;
  return `${padNumber(hours)}:${padNumber(mins)}:${padNumber(secs)},${padNumber(ms, 3)}`;
}

/**
 * Triggers a browser download of `content` as a file.
 *
 * @param {string} content - File contents.
 * @param {string} filename - Suggested file name.
 * @param {string} type - MIME type of the file.
 */
function downloadFile(content, filename, type) {
  const blob = new Blob([content], { type });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = filename;
  a.click();
  // Revoke on a later task so the download has started before the URL dies.
  setTimeout(() => URL.revokeObjectURL(url), 0);
}

// Initialize
checkWebGPU().catch((error) => {
  console.error('WebGPU check failed:', error);
});