// Video Caption App — in-browser video captioning with Transformers.js (FastVLM).
import {
  AutoProcessor,
  AutoModelForImageTextToText,
  TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/[email protected]';
/**
 * Browser app that captions an uploaded video with a local
 * image-text-to-text model (FastVLM) running via Transformers.js.
 *
 * Flow: upload/drop a video -> sample frames onto a canvas ->
 * caption each frame with the model -> ask the model for a summary.
 */
class VideoCaptionApp {
  constructor() {
    this.videoFile = null; // File object of the currently loaded video
    this.videoUrl = null; // object URL backing the <video> element
    this.processor = null; // lazily created AutoProcessor
    this.model = null; // lazily created vision-language model
    this.useGPU = false; // WebGPU backend toggle (CPU/WASM otherwise)
    this.isProcessing = false; // re-entrancy guard for analyzeVideo()
    this.initializeElements();
    this.bindEvents();
    this.checkWebGPUSupport();
  }

  /** Cache every DOM element the app reads or writes. */
  initializeElements() {
    this.elements = {
      videoPlayer: document.getElementById('videoPlayer'),
      fileInput: document.getElementById('fileInput'),
      uploadBtn: document.getElementById('uploadBtn'),
      analyzeBtn: document.getElementById('analyzeBtn'),
      captionPanel: document.getElementById('captionPanel'),
      closePanel: document.getElementById('closePanel'),
      loadingOverlay: document.getElementById('loadingOverlay'),
      loadingText: document.getElementById('loadingText'),
      progressFill: document.getElementById('progressFill'),
      resultsContainer: document.getElementById('resultsContainer'),
      frameCount: document.getElementById('frameCount'),
      frameCountValue: document.getElementById('frameCountValue'),
      maxDuration: document.getElementById('maxDuration'),
      maxDurationValue: document.getElementById('maxDurationValue'),
      toggleDevice: document.getElementById('toggleDevice'),
      videoPlaceholder: document.querySelector('.video-placeholder')
    };
  }

  /** Wire up UI events: upload, analyze, sliders, device toggle, drag & drop. */
  bindEvents() {
    this.elements.uploadBtn.addEventListener('click', () => {
      this.elements.fileInput.click();
    });
    this.elements.fileInput.addEventListener('change', (e) => {
      this.handleFileUpload(e);
    });
    this.elements.analyzeBtn.addEventListener('click', () => {
      this.analyzeVideo();
    });
    this.elements.closePanel.addEventListener('click', () => {
      this.elements.captionPanel.classList.remove('active');
    });
    this.elements.frameCount.addEventListener('input', (e) => {
      this.elements.frameCountValue.textContent = e.target.value;
    });
    this.elements.maxDuration.addEventListener('input', (e) => {
      this.elements.maxDurationValue.textContent = e.target.value;
    });
    this.elements.toggleDevice.addEventListener('click', () => {
      this.toggleDevice();
    });
    // Drag and drop support
    const app = document.getElementById('app');
    app.addEventListener('dragover', (e) => {
      e.preventDefault(); // required so the 'drop' event fires
      app.classList.add('dragging');
    });
    app.addEventListener('dragleave', () => {
      app.classList.remove('dragging');
    });
    app.addEventListener('drop', (e) => {
      e.preventDefault();
      app.classList.remove('dragging');
      const files = e.dataTransfer.files;
      if (files.length > 0 && files[0].type.startsWith('video/')) {
        this.loadVideo(files[0]);
      }
    });
  }

  /** Show the CPU/GPU toggle only when a WebGPU adapter is actually available. */
  async checkWebGPUSupport() {
    if ('gpu' in navigator) {
      try {
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) {
          this.elements.toggleDevice.style.display = 'flex';
          return;
        }
      } catch (e) {
        console.log('WebGPU not supported:', e);
      }
    }
    this.elements.toggleDevice.style.display = 'none';
  }

  /** Flip CPU/GPU and drop the cached model so it reloads on the next run. */
  toggleDevice() {
    this.useGPU = !this.useGPU;
    const deviceLabel = this.elements.toggleDevice.querySelector('.device-label');
    deviceLabel.textContent = this.useGPU ? 'GPU' : 'CPU';
    this.elements.toggleDevice.classList.toggle('gpu-active', this.useGPU);
    // Reset model to force reload with new device
    this.model = null;
    this.processor = null;
  }

  /** File-input handler; only accepts files whose MIME type is video/*. */
  handleFileUpload(event) {
    const file = event.target.files[0];
    if (file && file.type.startsWith('video/')) {
      this.loadVideo(file);
    }
  }

  /** Point the player at `file` and open the caption panel. */
  loadVideo(file) {
    // Release the previous object URL so repeated uploads do not leak memory.
    if (this.videoUrl) {
      URL.revokeObjectURL(this.videoUrl);
    }
    this.videoFile = file;
    this.videoUrl = URL.createObjectURL(file);
    this.elements.videoPlayer.src = this.videoUrl;
    this.elements.videoPlayer.style.display = 'block';
    this.elements.videoPlaceholder.style.display = 'none';
    this.elements.captionPanel.classList.add('active');
    // { once: true } prevents one extra listener piling up per upload.
    this.elements.videoPlayer.addEventListener('loadedmetadata', () => {
      console.log('Video loaded:', {
        duration: this.elements.videoPlayer.duration,
        width: this.elements.videoPlayer.videoWidth,
        height: this.elements.videoPlayer.videoHeight
      });
    }, { once: true });
  }

  /**
   * Decode the loaded video offscreen and grab `frameCount` JPEG frames,
   * evenly spaced over the first `maxDuration` seconds.
   * @param {{frameCount?: number, maxDuration?: number}} options
   * @returns {Promise<Array<{time: string, blob: Blob}>>} frames with
   *   their timestamp (seconds, 2 decimals) and JPEG blob.
   */
  async extractFramesFromVideo(options = {}) {
    const {
      frameCount = 8,
      maxDuration = 10,
    } = options;
    return new Promise((resolve, reject) => {
      const video = document.createElement('video');
      video.crossOrigin = 'anonymous';
      video.muted = true;
      // Resolve once the video is positioned at `time`. Assigning
      // currentTime its current value fires no 'seeked' event, so we
      // short-circuit when already there (e.g. the first frame at t=0).
      const seekTo = (time) => new Promise((seekDone) => {
        if (Math.abs(video.currentTime - time) < 1e-4 && video.readyState >= 2) {
          seekDone();
          return;
        }
        video.addEventListener('seeked', () => seekDone(), { once: true });
        video.currentTime = time;
      });
      video.addEventListener('loadedmetadata', async () => {
        // The async handler's errors would otherwise be unhandled
        // rejections, leaving the outer promise pending forever.
        try {
          const duration = Math.min(video.duration, maxDuration);
          const interval = duration / frameCount;
          const frames = [];
          const canvas = document.createElement('canvas');
          const ctx = canvas.getContext('2d');
          // Downscale proportionally to fit 640x480 — clamping width and
          // height independently would distort the aspect ratio.
          const scale = Math.min(1, 640 / video.videoWidth, 480 / video.videoHeight);
          canvas.width = Math.round(video.videoWidth * scale);
          canvas.height = Math.round(video.videoHeight * scale);
          for (let i = 0; i < frameCount; i++) {
            const time = i * interval;
            await seekTo(time);
            ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
            const blob = await new Promise((done) => {
              canvas.toBlob(done, 'image/jpeg', 0.8);
            });
            if (!blob) {
              throw new Error(`Could not encode frame at ${time.toFixed(2)}s`);
            }
            frames.push({
              time: time.toFixed(2),
              blob
            });
            // Frame extraction owns the first 20% of the progress bar.
            this.updateProgress((i + 1) / frameCount * 20);
          }
          resolve(frames);
        } catch (err) {
          reject(err);
        }
      });
      video.addEventListener('error', () => {
        // Media failures are reported on video.error (a MediaError);
        // the error *event* itself carries no message.
        reject(new Error(`Failed to load video: ${video.error?.message ?? 'unknown media error'}`));
      });
      video.src = this.videoUrl;
      video.load();
    });
  }

  /**
   * Decode a JPEG blob into an ImageBitmap for the processor.
   * Callers should close() the bitmap when done to release pixel memory.
   */
  async loadImageFromSource(blob) {
    return createImageBitmap(blob);
  }

  /** Set the progress bar to `percentage` (0-100). */
  updateProgress(percentage) {
    this.elements.progressFill.style.width = `${percentage}%`;
  }

  /**
   * Lazily download and instantiate the processor + model. Re-runs after
   * toggleDevice() nulls them, so the chosen backend takes effect.
   */
  async initializeModel() {
    if (!this.model || !this.processor) {
      this.elements.loadingText.textContent = 'Loading AI model...';
      const model_id = "onnx-community/FastVLM-0.5B-ONNX";
      this.processor = await AutoProcessor.from_pretrained(model_id);
      const deviceConfig = this.useGPU ? { device: 'webgpu' } : {};
      this.model = await AutoModelForImageTextToText.from_pretrained(model_id, {
        ...deviceConfig,
        // Quantized weights keep the in-browser download and memory small.
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
      });
      this.updateProgress(30);
    }
  }

  /**
   * Full pipeline: load model, extract frames, caption each frame, then
   * generate an overall summary and render the results.
   * No-op while a run is in progress or before a video is loaded.
   */
  async analyzeVideo() {
    if (this.isProcessing || !this.videoUrl) return;
    this.isProcessing = true;
    this.elements.loadingOverlay.classList.add('active');
    this.elements.analyzeBtn.disabled = true;
    this.elements.resultsContainer.innerHTML = '';
    this.updateProgress(0);
    try {
      await this.initializeModel();
      const frameCount = Number.parseInt(this.elements.frameCount.value, 10);
      const maxDuration = Number.parseInt(this.elements.maxDuration.value, 10);
      this.elements.loadingText.textContent = 'Extracting frames...';
      const frames = await this.extractFramesFromVideo({ frameCount, maxDuration });
      const frameAnalyses = [];
      // Per-frame captioning spans 30% -> 90% of the progress bar.
      const progressPerFrame = 60 / frameCount;
      for (let i = 0; i < frames.length; i++) {
        const frame = frames[i];
        this.elements.loadingText.textContent = `Analyzing frame ${i + 1} of ${frameCount}...`;
        const messages = [{
          role: "user",
          content: `Frame at ${frame.time}s: <image>Describe what's happening in this frame in detail.`,
        }];
        const prompt = this.processor.apply_chat_template(messages, {
          add_generation_prompt: true,
        });
        const image = await this.loadImageFromSource(frame.blob);
        const inputs = await this.processor(image, prompt, {
          add_special_tokens: false,
        });
        image.close(); // free the bitmap once its pixels are in the inputs
        const outputs = await this.model.generate({
          ...inputs,
          max_new_tokens: 256,
          do_sample: false,
        });
        // Decode only the newly generated tokens (skip the prompt tokens).
        const decoded = this.processor.batch_decode(
          outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
          { skip_special_tokens: true },
        );
        frameAnalyses.push({
          time: frame.time,
          description: decoded[0],
        });
        this.updateProgress(30 + (i + 1) * progressPerFrame);
      }
      this.elements.loadingText.textContent = 'Generating summary...';
      // Generate summary
      const summaryPrompt = `Based on these frame descriptions from a video, provide a coherent summary of what happens in the video:\n\n${
        frameAnalyses.map(f => `At ${f.time}s: ${f.description}`).join('\n')
      }\n\nVideo Summary:`;
      const messages = [{
        role: "user",
        content: summaryPrompt,
      }];
      const prompt = this.processor.apply_chat_template(messages, {
        add_generation_prompt: true,
      });
      // NOTE(review): an image is passed here although the prompt contains
      // no <image> token — confirm the processor tolerates this pairing.
      const firstFrameImage = await this.loadImageFromSource(frames[0].blob);
      const inputs = await this.processor(firstFrameImage, prompt, {
        add_special_tokens: false,
      });
      firstFrameImage.close();
      const outputs = await this.model.generate({
        ...inputs,
        max_new_tokens: 512,
        do_sample: false,
      });
      const decoded = this.processor.batch_decode(
        outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
        { skip_special_tokens: true },
      );
      const summary = decoded[0];
      this.updateProgress(100);
      this.displayResults({ frames: frameAnalyses, summary });
    } catch (error) {
      console.error('Analysis error:', error);
      this.showError(error.message);
    } finally {
      this.isProcessing = false;
      this.elements.loadingOverlay.classList.remove('active');
      this.elements.analyzeBtn.disabled = false;
    }
  }

  /**
   * Escape a value for safe interpolation into innerHTML.
   * @param {unknown} value - coerced to string before escaping.
   * @returns {string} HTML-safe text.
   */
  escapeHtml(value) {
    return String(value)
      .replaceAll('&', '&amp;')
      .replaceAll('<', '&lt;')
      .replaceAll('>', '&gt;')
      .replaceAll('"', '&quot;')
      .replaceAll("'", '&#39;');
  }

  /**
   * Render summary + per-frame descriptions. Model output is untrusted
   * text going into innerHTML, so it is escaped first.
   * @param {{frames: Array<{time: string, description: string}>, summary: string}} analysis
   */
  displayResults(analysis) {
    const summaryHtml = analysis.summary
      ? this.escapeHtml(analysis.summary)
      : 'No summary available';
    const frameItems = analysis.frames.map(frame => `
        <div class="frame-item">
          <div class="frame-time">${this.escapeHtml(frame.time)}s</div>
          <div class="frame-description">${this.escapeHtml(frame.description)}</div>
        </div>
      `).join('');
    this.elements.resultsContainer.innerHTML = `
      <div class="summary-section">
        <h3>Video Summary</h3>
        <p>${summaryHtml}</p>
      </div>
      <div class="frames-section">
        <h3>Frame-by-Frame Analysis</h3>
        <div class="frame-list">
          ${frameItems}
        </div>
      </div>
    `;
  }

  /** Show an error card in the results panel; message is HTML-escaped. */
  showError(message) {
    this.elements.resultsContainer.innerHTML = `
      <div class="error-message">
        <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
          <circle cx="12" cy="12" r="10"></circle>
          <line x1="12" y1="8" x2="12" y2="12"></line>
          <line x1="12" y1="16" x2="12.01" y2="16"></line>
        </svg>
        <p>Error: ${this.escapeHtml(message)}</p>
      </div>
    `;
  }
}
// Initialize app when DOM is ready | |
// Entry point: construct the app once the document has finished parsing.
const startApp = () => {
  new VideoCaptionApp();
};
document.addEventListener('DOMContentLoaded', startApp);