import {
AutoProcessor,
AutoModelForImageTextToText,
RawImage,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/[email protected]';
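// Browser-side video captioning demo: frames are sampled from an uploaded video,
// each frame is captioned with the FastVLM-0.5B ONNX model via transformers.js,
// and the per-frame captions are then condensed into a single summary.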
class VideoCaptionApp {
constructor() {
this.videoFile = null;
this.videoUrl = null;
this.processor = null;
this.model = null;
this.useGPU = false;
this.isProcessing = false;
this.initializeElements();
this.bindEvents();
this.checkWebGPUSupport();
}
initializeElements() {
this.elements = {
videoPlayer: document.getElementById('videoPlayer'),
fileInput: document.getElementById('fileInput'),
uploadBtn: document.getElementById('uploadBtn'),
analyzeBtn: document.getElementById('analyzeBtn'),
captionPanel: document.getElementById('captionPanel'),
closePanel: document.getElementById('closePanel'),
loadingOverlay: document.getElementById('loadingOverlay'),
loadingText: document.getElementById('loadingText'),
progressFill: document.getElementById('progressFill'),
resultsContainer: document.getElementById('resultsContainer'),
frameCount: document.getElementById('frameCount'),
frameCountValue: document.getElementById('frameCountValue'),
maxDuration: document.getElementById('maxDuration'),
maxDurationValue: document.getElementById('maxDurationValue'),
toggleDevice: document.getElementById('toggleDevice'),
videoPlaceholder: document.querySelector('.video-placeholder')
};
}
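// Wire up upload, analysis, panel, slider, device-toggle, and drag-and-drop handlers.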
bindEvents() {
this.elements.uploadBtn.addEventListener('click', () => {
this.elements.fileInput.click();
});
this.elements.fileInput.addEventListener('change', (e) => {
this.handleFileUpload(e);
});
this.elements.analyzeBtn.addEventListener('click', () => {
this.analyzeVideo();
});
this.elements.closePanel.addEventListener('click', () => {
this.elements.captionPanel.classList.remove('active');
});
this.elements.frameCount.addEventListener('input', (e) => {
this.elements.frameCountValue.textContent = e.target.value;
});
this.elements.maxDuration.addEventListener('input', (e) => {
this.elements.maxDurationValue.textContent = e.target.value;
});
this.elements.toggleDevice.addEventListener('click', () => {
this.toggleDevice();
});
// Drag and drop support
const app = document.getElementById('app');
app.addEventListener('dragover', (e) => {
e.preventDefault();
app.classList.add('dragging');
});
app.addEventListener('dragleave', () => {
app.classList.remove('dragging');
});
app.addEventListener('drop', (e) => {
e.preventDefault();
app.classList.remove('dragging');
const files = e.dataTransfer.files;
if (files.length > 0 && files[0].type.startsWith('video/')) {
this.loadVideo(files[0]);
}
});
}
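// Feature-detect WebGPU; the CPU/GPU toggle is only shown when an adapter is available.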
async checkWebGPUSupport() {
if ('gpu' in navigator) {
try {
const adapter = await navigator.gpu.requestAdapter();
if (adapter) {
this.elements.toggleDevice.style.display = 'flex';
return;
}
} catch (e) {
console.log('WebGPU not supported:', e);
}
}
this.elements.toggleDevice.style.display = 'none';
}
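// Switch between WASM (CPU) and WebGPU execution; clearing the cached model forces a reload on the next analysis.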
toggleDevice() {
this.useGPU = !this.useGPU;
const deviceLabel = this.elements.toggleDevice.querySelector('.device-label');
deviceLabel.textContent = this.useGPU ? 'GPU' : 'CPU';
this.elements.toggleDevice.classList.toggle('gpu-active', this.useGPU);
// Reset model to force reload with new device
this.model = null;
this.processor = null;
}
handleFileUpload(event) {
const file = event.target.files[0];
if (file && file.type.startsWith('video/')) {
this.loadVideo(file);
}
}
loadVideo(file) {
// Release the previous object URL (if any) before creating a new one.
if (this.videoUrl) {
URL.revokeObjectURL(this.videoUrl);
}
this.videoFile = file;
this.videoUrl = URL.createObjectURL(file);
this.elements.videoPlayer.src = this.videoUrl;
this.elements.videoPlayer.style.display = 'block';
this.elements.videoPlaceholder.style.display = 'none';
this.elements.captionPanel.classList.add('active');
this.elements.videoPlayer.addEventListener('loadedmetadata', () => {
console.log('Video loaded:', {
duration: this.elements.videoPlayer.duration,
width: this.elements.videoPlayer.videoWidth,
height: this.elements.videoPlayer.videoHeight
});
});
}
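// Samples `frameCount` evenly spaced frames from the first `maxDuration` seconds of the
// video by seeking a hidden <video> element, drawing each frame onto a canvas capped at
// 640x480, and exporting it as a JPEG blob (quality 0.8).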
async extractFramesFromVideo(options = {}) {
const {
frameCount = 8,
maxDuration = 10,
} = options;
return new Promise((resolve, reject) => {
const video = document.createElement('video');
video.crossOrigin = 'anonymous';
video.muted = true;
video.addEventListener('loadedmetadata', async () => {
const duration = Math.min(video.duration, maxDuration);
const interval = duration / frameCount;
const frames = [];
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
canvas.width = Math.min(video.videoWidth, 640);
canvas.height = Math.min(video.videoHeight, 480);
for (let i = 0; i < frameCount; i++) {
const time = i * interval;
video.currentTime = time;
await new Promise(resolve => {
video.addEventListener('seeked', () => {
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
canvas.toBlob((blob) => {
frames.push({
time: time.toFixed(2),
blob
});
// Frame extraction advances the progress bar from 10% to 30%.
this.updateProgress(10 + (i + 1) / frameCount * 20);
resolve();
}, 'image/jpeg', 0.8);
}, { once: true });
});
}
resolve(frames);
});
video.addEventListener('error', () => {
// The error event itself carries no message; the details live on video.error.
reject(new Error(`Failed to load video: ${video.error?.message || 'unknown error'}`));
});
video.src = this.videoUrl;
video.load();
});
}
async loadImageFromSource(blob) {
// Decode the JPEG blob into a RawImage, the image type the transformers.js processor expects.
return await RawImage.fromBlob(blob);
}
updateProgress(percentage) {
this.elements.progressFill.style.width = `${percentage}%`;
}
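// Lazily loads the processor and model on first use (and again after a device toggle
// clears them). Per-module dtypes keep the download small: fp16 token embeddings and
// 4-bit quantized vision encoder and decoder.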
async initializeModel() {
if (!this.model || !this.processor) {
this.elements.loadingText.textContent = 'Loading AI model...';
const model_id = "onnx-community/FastVLM-0.5B-ONNX";
this.processor = await AutoProcessor.from_pretrained(model_id);
const deviceConfig = this.useGPU ? { device: 'webgpu' } : {};
this.model = await AutoModelForImageTextToText.from_pretrained(model_id, {
...deviceConfig,
dtype: {
embed_tokens: "fp16",
vision_encoder: "q4",
decoder_model_merged: "q4",
},
});
// Model loading accounts for the first 10% of the progress bar.
this.updateProgress(10);
}
}
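// Full pipeline: load the model (to ~10%), extract frames (10-30%), caption each frame
// (30-90%), then ask the model for an overall summary (100%).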
async analyzeVideo() {
if (this.isProcessing || !this.videoUrl) return;
this.isProcessing = true;
this.elements.loadingOverlay.classList.add('active');
this.elements.analyzeBtn.disabled = true;
this.elements.resultsContainer.innerHTML = '';
this.updateProgress(0);
try {
await this.initializeModel();
const frameCount = parseInt(this.elements.frameCount.value);
const maxDuration = parseInt(this.elements.maxDuration.value);
this.elements.loadingText.textContent = 'Extracting frames...';
const frames = await this.extractFramesFromVideo({ frameCount, maxDuration });
const frameAnalyses = [];
const progressPerFrame = 60 / frameCount;
for (let i = 0; i < frames.length; i++) {
const frame = frames[i];
this.elements.loadingText.textContent = `Analyzing frame ${i + 1} of ${frameCount}...`;
const messages = [{
role: "user",
content: `Frame at ${frame.time}s: <image>Describe what's happening in this frame in detail.`,
}];
const prompt = this.processor.apply_chat_template(messages, {
add_generation_prompt: true,
});
const image = await this.loadImageFromSource(frame.blob);
const inputs = await this.processor(image, prompt, {
add_special_tokens: false,
});
const outputs = await this.model.generate({
...inputs,
max_new_tokens: 256,
do_sample: false,
});
const decoded = this.processor.batch_decode(
outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
{ skip_special_tokens: true },
);
frameAnalyses.push({
time: frame.time,
description: decoded[0],
});
this.updateProgress(30 + (i + 1) * progressPerFrame);
}
this.elements.loadingText.textContent = 'Generating summary...';
// Generate summary
const summaryPrompt = `Based on these frame descriptions from a video, provide a coherent summary of what happens in the video:\n\n${
frameAnalyses.map(f => `At ${f.time}s: ${f.description}`).join('\n')
}\n\nVideo Summary:`;
// Reuse the first frame and include an <image> placeholder so the summary request
// matches the image+text format used for the per-frame prompts above.
const messages = [{
role: "user",
content: `<image>${summaryPrompt}`,
}];
const prompt = this.processor.apply_chat_template(messages, {
add_generation_prompt: true,
});
const firstFrameImage = await this.loadImageFromSource(frames[0].blob);
const inputs = await this.processor(firstFrameImage, prompt, {
add_special_tokens: false,
});
const outputs = await this.model.generate({
...inputs,
max_new_tokens: 512,
do_sample: false,
});
const decoded = this.processor.batch_decode(
outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
{ skip_special_tokens: true },
);
const summary = decoded[0];
this.updateProgress(100);
this.displayResults({ frames: frameAnalyses, summary });
} catch (error) {
console.error('Analysis error:', error);
this.showError(error.message);
} finally {
this.isProcessing = false;
this.elements.loadingOverlay.classList.remove('active');
this.elements.analyzeBtn.disabled = false;
}
}
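// Renders the summary and per-frame captions. Note that model output is interpolated
// into innerHTML unescaped; sanitizing or escaping it first would be safer.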
displayResults(analysis) {
let html = `
<div class="summary-section">
<h3>Video Summary</h3>
<p>${analysis.summary || 'No summary available'}</p>
</div>
<div class="frames-section">
<h3>Frame-by-Frame Analysis</h3>
<div class="frame-list">
`;
analysis.frames.forEach(frame => {
html += `
<div class="frame-item">
<div class="frame-time">${frame.time}s</div>
<div class="frame-description">${frame.description}</div>
</div>
`;
});
html += `
</div>
</div>
`;
this.elements.resultsContainer.innerHTML = html;
}
showError(message) {
this.elements.resultsContainer.innerHTML = `
<div class="error-message">
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<circle cx="12" cy="12" r="10"></circle>
<line x1="12" y1="8" x2="12" y2="12"></line>
<line x1="12" y1="16" x2="12.01" y2="16"></line>
</svg>
<p>Error: ${message}</p>
</div>
`;
}
}
// Initialize app when DOM is ready
document.addEventListener('DOMContentLoaded', () => {
new VideoCaptionApp();
});