Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from transformers import AutoProcessor, PaliGemmaForConditionalGeneration | |
| from PIL import Image | |
| import cv2 | |
| import torch | |
# Checkpoint identifier shared by the processor and the model.
mix_model_id = "google/paligemma-3b-mix-224"

# Processor (builds model inputs from an image and a text prompt) and the
# generation model, both loaded once at import time from the same checkpoint.
mix_processor = AutoProcessor.from_pretrained(mix_model_id)
mix_model = PaliGemmaForConditionalGeneration.from_pretrained(mix_model_id)
| # Define function to extract frames from the video | |
def extract_frames(video_path, frame_interval=1):
    """Read a video with OpenCV and return every ``frame_interval``-th frame.

    Args:
        video_path: Location of the video, handed to ``cv2.VideoCapture``.
        frame_interval: Keep one frame out of this many, starting with the
            first frame read (index 0). Must be at least 1.

    Returns:
        A list of the kept frames, exactly as returned by
        ``VideoCapture.read``, in the order they were read. The list is
        empty when no frame could be read.

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    # Validate up front: an interval of 0 would otherwise surface as a
    # ZeroDivisionError from the modulo below, after the capture was opened.
    if frame_interval < 1:
        raise ValueError(f"frame_interval must be >= 1, got {frame_interval}")

    vidcap = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        success, image = vidcap.read()
        while success:
            # Keep only the frames whose index falls on the interval.
            if count % frame_interval == 0:
                frames.append(image)
            success, image = vidcap.read()
            count += 1
    finally:
        # Release the capture even if reading raises part-way through.
        vidcap.release()
    return frames
| # Define function to generate captions for a video | |
def process_video(video, prompt):
    """Caption sampled frames of a video with the PaliGemma mix model.

    Every 10th frame of the video is extracted, the model is run on each of
    those frames together with ``prompt``, and the per-frame answers are
    joined with single spaces.

    Args:
        video: Path of the uploaded video file, or a falsy value (``None`` or
            an empty string) when nothing was uploaded.
        prompt: Text prompt given to the model along with every frame.

    Returns:
        The space-joined per-frame captions. A frame whose generation or
        decoding raises ``IndexError`` contributes the text
        "Error processing frame". The result is an empty string when no
        frame could be read, and "No video provided." when ``video`` is
        falsy.
    """
    # Nothing uploaded: answer in the output box instead of crashing inside
    # OpenCV on a missing path.
    if not video:
        return "No video provided."

    captions = []
    for frame in extract_frames(video, frame_interval=10):
        # The frame is converted from BGR to RGB channel order and wrapped in
        # a PIL image before being handed to the processor.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # Keyword arguments keep the call independent of the positional
        # order of ``text`` and ``images`` in the processor's signature.
        model_inputs = mix_processor(text=prompt, images=image, return_tensors="pt")
        prompt_length = model_inputs["input_ids"].shape[-1]
        try:
            output = mix_model.generate(**model_inputs, max_new_tokens=20)
            # The generated sequence starts with the input tokens; drop them
            # by token count rather than by slicing the decoded string, which
            # only works if the decoded text begins with the exact prompt.
            new_tokens = output[0][prompt_length:]
            captions.append(mix_processor.decode(new_tokens, skip_special_tokens=True))
        except IndexError as e:
            # Best effort: keep going with the remaining frames.
            print(f"IndexError: {e}")
            captions.append("Error processing frame")
    # Combine all frame captions into one description.
    return " ".join(captions)
# Gradio components: a video upload and a prompt box in, one caption box out.
inputs = [
    gr.Video(label="Upload Video"),
    gr.Textbox(label="Prompt", placeholder="Enter your question"),
]
outputs = gr.Textbox(label="Generated Caption")

# Wire process_video into a single-function interface.
demo = gr.Interface(
    fn=process_video,
    inputs=inputs,
    outputs=outputs,
    title="Video Captioning with Mix PaliGemma Model",
    description="Upload a video and get captions based on your prompt.",
)

# Launch the app with debug output enabled.
demo.launch(debug=True)