|
// NOTE(review): the version segment of this specifier reads "[email protected]",
// which looks like an e-mail-obfuscation artifact of "transformers@<version>".
// Restore the intended version pin before shipping.
import {
  AutoModelForImageTextToText,
  AutoProcessor,
  RawImage,
  TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/[email protected]';
|
|
|
// ---------------------------------------------------------------------------
// Mutable module state shared by the handlers in this file.
// ---------------------------------------------------------------------------

/** Result of `AutoProcessor.from_pretrained`; stays `null` until the first load. */
let processor = null;

/** Result of `AutoModelForImageTextToText.from_pretrained`; stays `null` until the first load. */
let model = null;

/** File most recently accepted by `handleVideoFile`, or `null` when none was chosen yet. */
let videoFile = null;

/** Extracted frame records of the form `{ time, blob, dataUrl }`. */
let frames = [];

/** Caption records of the form `{ time, caption, thumbnail }`. */
let captions = [];
|
|
|
|
|
// ---------------------------------------------------------------------------
// Page elements this script drives, looked up once at module load.
// ---------------------------------------------------------------------------

/** Shorthand for `document.getElementById`. */
const byId = (id) => document.getElementById(id);

// Upload and preview.
const uploadArea = byId('uploadArea');
const videoInput = byId('videoInput');
const videoSection = byId('videoSection');
const videoPlayer = byId('videoPlayer');
const frameCanvas = byId('frameCanvas');

// Processing controls and progress display.
const processBtn = byId('processBtn');
const progressSection = byId('progressSection');
const progressFill = byId('progressFill');
const progressText = byId('progressText');

// Results and settings.
const resultsSection = byId('resultsSection');
const framesList = byId('framesList');
const deviceSelect = byId('deviceSelect');
|
|
|
|
|
/**
 * Switches the device selector to WASM when WebGPU cannot actually be used.
 *
 * The presence of `navigator.gpu` alone is not sufficient: `requestAdapter()`
 * may resolve to `null` when no suitable adapter is available, so the adapter
 * is requested before the WebGPU option is left enabled.
 *
 * @returns {Promise<void>}
 */
async function checkWebGPU() {
  let hasAdapter = false;

  try {
    // `navigator.gpu` is undefined in browsers without WebGPU support.
    const adapter = await navigator.gpu?.requestAdapter();
    hasAdapter = adapter != null;
  } catch (error) {
    console.warn('WebGPU availability check failed:', error);
  }

  if (hasAdapter) {
    return;
  }

  // The option may be absent from the markup; only the fallback value is required.
  const webgpuOption = deviceSelect.querySelector('option[value="webgpu"]');
  if (webgpuOption) {
    webgpuOption.disabled = true;
  }
  deviceSelect.value = 'wasm';
}
|
|
|
|
|
/**
 * Loads the FastVLM processor and model into the module-level `processor`
 * and `model` variables, reporting each stage through the progress elements.
 *
 * @returns {Promise<boolean>} `true` when both loads succeeded, `false` when
 *   either one threw (the error is logged and a message is shown).
 */
async function initializeModel() {
  const modelId = 'onnx-community/FastVLM-0.5B-ONNX';

  const reportStage = (message, width) => {
    progressText.textContent = message;
    progressFill.style.width = width;
  };

  try {
    reportStage('Loading processor...', '30%');
    processor = await AutoProcessor.from_pretrained(modelId);

    reportStage('Loading model (this may take a moment)...', '60%');

    // Anything other than an explicit "webgpu" selection runs on WASM.
    const device = deviceSelect.value === 'webgpu' ? 'webgpu' : 'wasm';
    const dtype = {
      embed_tokens: 'fp16',
      vision_encoder: 'q4',
      decoder_model_merged: 'q4',
    };
    model = await AutoModelForImageTextToText.from_pretrained(modelId, { device, dtype });

    progressFill.style.width = '100%';
    progressText.textContent = 'Model loaded successfully!';
    return true;
  } catch (error) {
    console.error('Error initializing model:', error);
    progressText.textContent = 'Error loading model. Please refresh and try again.';
    return false;
  }
}
|
|
|
|
|
// Clicking the drop zone opens the native file picker.
uploadArea.addEventListener('click', () => {
  videoInput.click();
});

// Highlight the drop zone while a drag hovers over it. preventDefault() is
// required on dragover for the element to accept a drop.
uploadArea.addEventListener('dragover', (event) => {
  event.preventDefault();
  uploadArea.classList.add('dragover');
});

uploadArea.addEventListener('dragleave', () => {
  uploadArea.classList.remove('dragover');
});

// Accept the first dropped file, but only when its MIME type is a video type.
uploadArea.addEventListener('drop', (event) => {
  event.preventDefault();
  uploadArea.classList.remove('dragover');

  const droppedFile = event.dataTransfer.files[0];
  if (droppedFile?.type.startsWith('video/')) {
    handleVideoFile(droppedFile);
  }
});

// Files chosen through the picker are forwarded as-is.
videoInput.addEventListener('change', ({ target }) => {
  const selectedFile = target.files[0];
  if (selectedFile) {
    handleVideoFile(selectedFile);
  }
});
|
|
|
/**
 * Accepts a newly chosen video: loads it into the player, reveals the video
 * section, hides stale results and clears previously extracted data.
 *
 * Files larger than 100 MB are rejected with an alert and leave the current
 * state untouched.
 *
 * @param {Blob} file - The selected video file.
 * @returns {void}
 */
function handleVideoFile(file) {
  const maxBytes = 100 * 1024 * 1024;
  if (file.size > maxBytes) {
    alert('File size exceeds 100MB limit');
    return;
  }

  // Each createObjectURL() keeps its blob reachable until revoked, so the URL
  // of the previously loaded video is released once the player has moved on.
  const previousUrl = videoPlayer.src;

  videoFile = file;
  videoPlayer.src = URL.createObjectURL(file);

  if (typeof previousUrl === 'string' && previousUrl.startsWith('blob:')) {
    URL.revokeObjectURL(previousUrl);
  }

  videoSection.classList.remove('hidden');
  resultsSection.classList.add('hidden');
  frames = [];
  captions = [];
}
|
|
|
|
|
/**
 * Resolves when `video` dispatches `eventName` and rejects if it dispatches
 * `error` first. Listeners are attached before `start` runs so that an event
 * triggered by `start` cannot be missed, and are removed once settled.
 *
 * @param {HTMLMediaElement} video - Element to listen on.
 * @param {string} eventName - Event to wait for.
 * @param {() => void} [start] - Optional action that triggers the event.
 * @returns {Promise<void>}
 */
function waitForVideoEvent(video, eventName, start) {
  return new Promise((resolve, reject) => {
    const detach = () => {
      video.removeEventListener(eventName, handleEvent);
      video.removeEventListener('error', handleError);
    };
    const handleEvent = () => {
      detach();
      resolve();
    };
    const handleError = () => {
      detach();
      reject(new Error(`Video failed while waiting for "${eventName}"`));
    };

    video.addEventListener(eventName, handleEvent);
    video.addEventListener('error', handleError);
    start?.();
  });
}

/**
 * Samples the loaded video every `frameInterval` seconds (read from the
 * `#frameInterval` control) and encodes each sampled frame as a JPEG.
 *
 * The module-level `frames` array is replaced by the new list, which is also
 * returned.
 *
 * @returns {Promise<Array<{time: number, blob: Blob, dataUrl: string}>>}
 * @throws {Error} When the interval is not a positive integer, the video
 *   duration is unavailable, the video errors while seeking, or a frame
 *   cannot be encoded.
 */
async function extractFrames() {
  const rawInterval = document.getElementById('frameInterval').value;
  const interval = Number.parseInt(rawInterval, 10);
  // A zero or negative step would never advance the loop below.
  if (!Number.isFinite(interval) || interval <= 0) {
    throw new Error(`Invalid frame interval: "${rawInterval}"`);
  }

  // Duration and dimensions are only known once metadata has loaded
  // (readyState 1 is HAVE_METADATA).
  if (videoPlayer.readyState < 1) {
    await waitForVideoEvent(videoPlayer, 'loadedmetadata');
  }

  const { duration, videoWidth, videoHeight } = videoPlayer;
  if (!Number.isFinite(duration) || duration <= 0) {
    throw new Error('Video duration is unavailable');
  }

  // Size the canvas once; the video dimensions do not change between seeks.
  frameCanvas.width = videoWidth;
  frameCanvas.height = videoHeight;
  const ctx = frameCanvas.getContext('2d');

  const extracted = [];
  frames = extracted;

  for (let time = 0; time < duration; time += interval) {
    await waitForVideoEvent(videoPlayer, 'seeked', () => {
      videoPlayer.currentTime = time;
    });

    ctx.drawImage(videoPlayer, 0, 0);

    const blob = await new Promise((resolve) => {
      frameCanvas.toBlob(resolve, 'image/jpeg', 0.9);
    });
    // toBlob() passes null when the image cannot be created.
    if (!blob) {
      throw new Error(`Could not encode the frame at ${time}s`);
    }

    extracted.push({
      time,
      blob,
      dataUrl: await blobToDataUrl(blob),
    });
  }

  return extracted;
}
|
|
|
/**
 * Reads a blob and returns its contents as a `data:` URL.
 *
 * @param {Blob} blob - Blob to encode.
 * @returns {Promise<string>} Resolves with the data URL; rejects when the
 *   read fails or is aborted, rather than resolving with `null`.
 */
function blobToDataUrl(blob) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    reader.onerror = () => {
      reject(reader.error ?? new Error('Failed to read blob as a data URL'));
    };
    reader.onabort = () => {
      reject(new Error('Reading blob as a data URL was aborted'));
    };
    reader.readAsDataURL(blob);
  });
}
|
|
|
|
|
/**
 * Generates a caption for one extracted frame and updates the progress
 * display for that frame.
 *
 * @param {string} imageDataUrl - Frame image encoded as a data URL.
 * @param {number} frameIndex - Zero-based index of the frame being processed.
 * @param {number} totalFrames - Total number of frames in this run.
 * @returns {Promise<string>} The decoded caption. On any failure the error is
 *   logged and a fixed placeholder message is returned instead of throwing.
 */
async function generateCaption(imageDataUrl, frameIndex, totalFrames) {
  try {
    progressText.textContent = `Processing frame ${frameIndex + 1} of ${totalFrames}...`;
    progressFill.style.width = `${((frameIndex + 1) / totalFrames) * 100}%`;

    const messages = [
      {
        role: 'user',
        content: '<image>Describe this video frame in detail. What is happening in this scene?',
      },
    ];
    const prompt = processor.apply_chat_template(messages, {
      add_generation_prompt: true,
    });

    // The Transformers.js processor operates on RawImage instances, not DOM
    // <img> elements, so the frame is decoded through RawImage.
    const image = await RawImage.fromURL(imageDataUrl);
    const inputs = await processor(image, prompt, {
      add_special_tokens: false,
    });

    const outputs = await model.generate({
      ...inputs,
      max_new_tokens: 256,
      do_sample: false,
    });

    // Drop the prompt tokens so that only newly generated text is decoded.
    const promptLength = inputs.input_ids.dims.at(-1);
    const [caption] = processor.batch_decode(
      outputs.slice(null, [promptLength, null]),
      { skip_special_tokens: true },
    );

    return caption;
  } catch (error) {
    console.error('Error generating caption:', error);
    return 'Error generating caption for this frame';
  }
}
|
|
|
|
|
// Handle of the timer that hides the progress panel after a successful run.
// It is kept so that starting another run within the delay can cancel it;
// otherwise the pending timer would hide the progress of the new run.
let hideProgressTimer = null;

/**
 * Runs the whole pipeline for the currently selected video: loads the model
 * on first use, extracts frames, captions them one by one and renders the
 * results after every caption so they appear progressively.
 *
 * The button is disabled for the duration of the run and is always re-enabled
 * afterwards, whichever way the run ends.
 */
processBtn.addEventListener('click', async () => {
  if (!videoFile) {
    return;
  }

  // A different upload during the run replaces `videoFile` and resets the
  // shared arrays; the run is then stale and must stop touching them.
  const activeFile = videoFile;
  const isStale = () => videoFile !== activeFile;
  const reportStale = () => {
    progressText.textContent = 'Processing stopped because a different video was selected.';
  };

  clearTimeout(hideProgressTimer);
  processBtn.disabled = true;
  progressSection.classList.remove('hidden');
  resultsSection.classList.add('hidden');

  try {
    if (!model || !processor) {
      const isLoaded = await initializeModel();
      if (!isLoaded) {
        // initializeModel() has already shown the failure message.
        return;
      }
    }

    progressText.textContent = 'Extracting frames...';
    progressFill.style.width = '20%';
    const extracted = await extractFrames();

    if (isStale()) {
      // Discard frames that belong to the previous video.
      frames = [];
      reportStale();
      return;
    }
    frames = extracted;

    if (extracted.length === 0) {
      progressText.textContent = 'No frames could be extracted from this video.';
      return;
    }

    const results = [];
    captions = results;

    for (const [index, frame] of extracted.entries()) {
      const caption = await generateCaption(frame.dataUrl, index, extracted.length);

      if (isStale()) {
        reportStale();
        return;
      }

      results.push({
        time: frame.time,
        caption,
        thumbnail: frame.dataUrl,
      });

      // Re-render after every caption so results show up as they are produced.
      displayResults();
      resultsSection.classList.remove('hidden');
    }

    progressText.textContent = 'Processing complete!';
    hideProgressTimer = setTimeout(() => {
      progressSection.classList.add('hidden');
    }, 2000);
  } catch (error) {
    console.error('Processing error:', error);
    progressText.textContent = 'Error processing video. Please try again.';
  } finally {
    processBtn.disabled = false;
  }
});
|
|
|
|
|
/**
 * Re-renders the list of captioned frames from the module-level `captions`
 * array, replacing whatever `framesList` currently contains.
 *
 * Cards are built from DOM nodes and the caption is assigned through
 * `textContent`: captions are model output and must not be parsed as HTML.
 *
 * @returns {void}
 */
function displayResults() {
  const cards = captions.map((item) => {
    const timeLabel = formatTime(item.time);

    const image = document.createElement('img');
    image.src = item.thumbnail;
    image.alt = `Frame at ${timeLabel}`;

    const timeBadge = document.createElement('span');
    timeBadge.className = 'frame-time';
    timeBadge.textContent = timeLabel;

    const thumbnail = document.createElement('div');
    thumbnail.className = 'frame-thumbnail';
    thumbnail.append(image, timeBadge);

    const captionText = document.createElement('p');
    captionText.textContent = item.caption;

    const captionBox = document.createElement('div');
    captionBox.className = 'frame-caption';
    captionBox.append(captionText);

    const card = document.createElement('div');
    card.className = 'frame-card';
    card.append(thumbnail, captionBox);
    return card;
  });

  framesList.replaceChildren(...cards);
}
|
|
|
/**
 * Formats a number of seconds as `MM:SS`, zero-padding both parts. Minutes
 * are not wrapped at 60, so one hour is rendered as `60:00`.
 *
 * @param {number} seconds - Time offset in seconds.
 * @returns {string} The formatted time.
 */
function formatTime(seconds) {
  const twoDigits = (value) => String(value).padStart(2, '0');
  const wholeMinutes = Math.floor(seconds / 60);
  const leftoverSeconds = Math.floor(seconds % 60);
  return [wholeMinutes, leftoverSeconds].map((part) => twoDigits(part)).join(':');
}
|
|
|
|
|
// Export the caption records as pretty-printed JSON.
document.getElementById('exportJson').addEventListener('click', () => {
  downloadFile(JSON.stringify(captions, null, 2), 'captions.json', 'application/json');
});
|
|
|
// Export the captions as an SubRip (.srt) subtitle file.
document.getElementById('exportSrt').addEventListener('click', () => {
  // Longest time a cue stays on screen when no later cue cuts it short.
  const maxCueSeconds = 5;

  const cues = captions.map((item, index) => {
    // End the cue at the start of the next one when frames are sampled less
    // than `maxCueSeconds` apart, so that consecutive cues never overlap.
    const next = captions[index + 1];
    const latestEnd = item.time + maxCueSeconds;
    const end = next ? Math.min(latestEnd, next.time) : latestEnd;

    // A blank line terminates an SRT cue, so line breaks inside the caption
    // are collapsed to single newlines.
    const text = String(item.caption).trim().replace(/\s*\n\s*/g, '\n');

    return `${index + 1}\n${formatSrtTime(item.time)} --> ${formatSrtTime(end)}\n${text}\n\n`;
  });

  downloadFile(cues.join(''), 'captions.srt', 'text/plain');
});
|
|
|
// Export the captions as plain text, one "[MM:SS] caption" paragraph per frame.
document.getElementById('exportTxt').addEventListener('click', () => {
  const paragraphs = captions.map(
    ({ time, caption }) => `[${formatTime(time)}] ${caption}\n\n`,
  );
  downloadFile(paragraphs.join(''), 'captions.txt', 'text/plain');
});
|
|
|
/**
 * Formats a number of seconds as an SRT timestamp, `HH:MM:SS,mmm`.
 *
 * The value is converted to whole milliseconds first and split with integer
 * arithmetic. Taking the fractional part with `% 1` and flooring it loses a
 * millisecond to floating-point error (2.3 s would come out as `,299`).
 * Negative inputs are clamped to zero because SRT has no negative timestamps.
 *
 * @param {number} seconds - Time offset in seconds.
 * @returns {string} The formatted timestamp.
 */
function formatSrtTime(seconds) {
  const totalMs = Math.max(0, Math.round(seconds * 1000));
  const totalSeconds = Math.floor(totalMs / 1000);

  const hours = Math.floor(totalSeconds / 3600);
  const mins = Math.floor(totalSeconds / 60) % 60;
  const secs = totalSeconds % 60;
  const ms = totalMs % 1000;

  const pad = (value, width) => String(value).padStart(width, '0');
  return `${pad(hours, 2)}:${pad(mins, 2)}:${pad(secs, 2)},${pad(ms, 3)}`;
}
|
|
|
/**
 * Offers `content` to the user as a file download.
 *
 * @param {string} content - File contents.
 * @param {string} filename - Suggested name for the downloaded file.
 * @param {string} type - MIME type of the file.
 * @returns {void}
 */
function downloadFile(content, filename, type) {
  // Delay before the object URL is released, in milliseconds.
  const revokeDelayMs = 1000;

  const url = URL.createObjectURL(new Blob([content], { type }));

  const link = document.createElement('a');
  link.href = url;
  link.download = filename;

  // Some browsers ignore click() on an anchor that is not in the document.
  document.body.appendChild(link);
  link.click();
  link.remove();

  // Revoking in the same tick as click() can cancel the download in browsers
  // that start it asynchronously, so the URL is released a moment later.
  setTimeout(() => URL.revokeObjectURL(url), revokeDelayMs);
}
|
|
|
|
|
// Probe WebGPU support once at startup. The returned promise is handled here
// so that a failure is logged instead of surfacing as an unhandled rejection.
checkWebGPU().catch((error) => {
  console.error('WebGPU availability check failed:', error);
});