# Spaces: Sleeping  (appears to be hosting-page status text captured with the source; not part of the program)
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
from sklearn.metrics.pairwise import cosine_similarity | |
import torch | |
import json | |
import os | |
from pathlib import Path | |
class VideoRetrieval:
    """Text-to-clip retrieval over per-clip feature matrices.

    A query string is embedded with a SentenceTransformer model and compared,
    by cosine similarity, against three feature matrices (visual, scene and
    object). The per-type similarities are blended with fixed weights and the
    best-scoring clips are returned together with their metadata.

    Attributes set during construction:
        text_model: SentenceTransformer used to embed queries.
        features: dict mapping a feature-type name to a matrix of clip
            features (one row per clip in the dummy data).
        clips_df: DataFrame of clip metadata. ``retrieve_clips`` indexes it by
            feature-row position, so both must share the same row order.
    """

    # Relative contribution of each feature type to the combined score.
    FEATURE_WEIGHTS = {
        'visual_features': 0.4,
        'scene_features': 0.3,
        'object_features': 0.3,
    }

    def __init__(self, use_dummy_data=True):
        """Load the text encoder, then either synthetic or on-disk clip data.

        Args:
            use_dummy_data: when true, generate random demo data; otherwise
                try to load pre-computed features and metadata from disk.
        """
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        if use_dummy_data:
            self.create_dummy_data()
        else:
            self.load_data()

    def create_dummy_data(self):
        """Populate ``features`` and ``clips_df`` with synthetic demo content.

        The feature matrices are standard-normal noise, so rankings computed
        from them carry no real meaning and change between runs unless
        NumPy's global random state is seeded elsewhere.
        """
        n_clips = 20
        feature_dim = 384  # matching the dimension of all-MiniLM-L6-v2

        # Drawn in the order visual -> scene -> object.
        self.features = {
            name: np.random.randn(n_clips, feature_dim)
            for name in ('visual_features', 'scene_features', 'object_features')
        }

        movie_titles = [
            "The Matrix", "Inception", "The Dark Knight", "Pulp Fiction",
            "The Shawshank Redemption", "Forrest Gump", "The Godfather",
            "Fight Club", "Interstellar", "The Silence of the Lambs",
        ]
        descriptions = [
            "A dramatic confrontation in a dark room where the truth is revealed",
            "A high-stakes chase through a crowded city street",
            "An emotional reunion between long-lost friends",
            "A tense negotiation that determines the fate of many",
            "A quiet moment of reflection before a life-changing decision",
        ]
        # Sample YouTube clips (famous movie scenes)
        youtube_clips = [
            "https://www.youtube.com/watch?v=kcsNbQRU5TI",  # Matrix - Red Pill Blue Pill
            "https://www.youtube.com/watch?v=YoHD9XEInc0",  # Inception - Hallway Fight
            "https://www.youtube.com/watch?v=ZWCAf-xLV2k",  # Dark Knight - Interrogation
            "https://www.youtube.com/watch?v=Jomr9SAjcyw",  # Pulp Fiction - Restaurant
            "https://www.youtube.com/watch?v=SQ7_5MMbPYs",  # Shawshank - Hope Speech
        ]

        # The three lists are shorter than n_clips, so each is cycled with a
        # modulo index; clip i covers minutes i*5 .. i*5+3.
        rows = [
            {
                'clip_id': f'clip_{i}',
                'movie_title': movie_titles[i % len(movie_titles)],
                'description': descriptions[i % len(descriptions)],
                'timestamp': f'{(i*5):02d}:00 - {(i*5+3):02d}:00',
                'duration': '3:00',
                'youtube_url': youtube_clips[i % len(youtube_clips)],
            }
            for i in range(n_clips)
        ]
        self.clips_df = pd.DataFrame(rows)

    def load_data(self):
        """Load pre-computed features and clip metadata from disk.

        If any of the files is missing, an error is shown through Streamlit
        and the instance falls back to the synthetic demo data. Only
        ``FileNotFoundError`` is handled; other load errors propagate.
        """
        try:
            self.features = {
                'visual_features': np.load('path_to_visual_features.npy'),
                'scene_features': np.load('path_to_scene_features.npy'),
                'object_features': np.load('path_to_object_features.npy'),
            }
            self.clips_df = pd.read_csv('clips_metadata.csv')
        except FileNotFoundError as e:
            st.error(f"Error loading data: {e}. Falling back to dummy data.")
            self.create_dummy_data()

    def encode_query(self, query_text):
        """Return the text model's embedding of ``query_text``."""
        return self.text_model.encode(query_text)

    def compute_similarity(self, query_embedding, feature_type='visual_features'):
        """Cosine similarity between one query embedding and every clip.

        Args:
            query_embedding: embedding of the query; reshaped to a single row.
            feature_type: key into ``self.features`` selecting the matrix to
                compare against.

        Returns:
            The first (only) row of the similarity matrix, i.e. one score per
            row of the selected feature matrix.
        """
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.features[feature_type],
        )
        return similarities[0]

    @staticmethod
    def _top_indices(scores, top_k):
        """Return indices of the ``top_k`` highest scores, best first.

        ``top_k`` is clamped to ``[0, len(scores)]``. Without the clamp a
        value of 0 would slice ``[-0:]`` and select every element, and a
        negative value would select an arbitrary tail of the ranking.
        """
        count = min(top_k, len(scores))
        if count <= 0:
            return np.array([], dtype=int)
        return np.argsort(scores)[-count:][::-1]

    def retrieve_clips(self, query_text: str, top_k: int = 3) -> list:
        """Retrieve the ``top_k`` clips most similar to a text query.

        Args:
            query_text: natural-language description of the wanted scene.
            top_k: maximum number of clips to return; values <= 0 yield an
                empty list.

        Returns:
            A list of dicts, best match first, each with ``clip_id``,
            ``movie_title``, ``description``, ``timestamp``, ``youtube_url``
            and ``similarity_score`` (a plain ``float``).
        """
        query_embedding = self.encode_query(query_text)

        # Weighted sum of the per-feature-type similarity vectors.
        combined_similarities = sum(
            self.compute_similarity(query_embedding, feat_type) * weight
            for feat_type, weight in self.FEATURE_WEIGHTS.items()
        )

        results = []
        for idx in self._top_indices(combined_similarities, top_k):
            clip = self.clips_df.iloc[idx]  # single positional row lookup
            results.append({
                'clip_id': clip['clip_id'],
                'movie_title': clip['movie_title'],
                'description': clip['description'],
                'timestamp': clip['timestamp'],
                'youtube_url': clip['youtube_url'],
                # Convert to float for JSON serialization
                'similarity_score': float(combined_similarities[idx]),
            })
        return results
def main():
    """Render the Streamlit page: search form, ranked results and sidebar.

    A single ``VideoRetrieval`` instance (dummy data) is kept in
    ``st.session_state`` and reused when one is already stored there, so it
    is not rebuilt every time this function runs.
    """
    # NOTE(review): several icon glyphs in the UI strings below look
    # mis-encoded (probably emoji decoded with the wrong charset) - confirm
    # against the original file before changing them.
    st.set_page_config(
        page_title="Movie Scene Retrieval System",
        page_icon="π¬",
        layout="wide"
    )
    st.title("π¬ Movie Scene Retrieval System")
    st.write("""
    Search for movie scenes using natural language descriptions.
    The system will retrieve the most relevant 2-3 minute clips based on your query.
    *Note: This is a demo version using simulated data.*
    """)

    # Initialize retrieval system (once per session)
    if "retrieval_system" not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval(use_dummy_data=True)
    retrieval_system = st.session_state.retrieval_system

    # Search interface
    col1, col2 = st.columns([3, 1])
    with col1:
        query = st.text_input(
            "Enter your scene description:",
            placeholder="e.g., A dramatic confrontation between two characters in a dark room"
        )
    with col2:
        num_results = st.slider("Number of results:", min_value=1, max_value=5, value=3)

    if st.button("π Search", type="primary"):
        if not query:
            st.warning("Please enter a scene description.")
        else:
            with st.spinner("Searching for relevant clips..."):
                results = retrieval_system.retrieve_clips(query, top_k=num_results)

            for result in results:
                url = result['youtube_url']
                with st.container():
                    st.subheader(f"{result['movie_title']}")
                    cols = st.columns([2, 1])
                    with cols[0]:
                        st.markdown("**Scene Description:**")
                        st.write(result['description'])
                        st.text(f"β±οΈ Timestamp: {result['timestamp']}")
                        # Add video player
                        if url:
                            st.video(url)
                    with cols[1]:
                        st.markdown("**Relevance Score:**")
                        # Clamp to [0, 1]: the combined cosine score can be
                        # negative, and it is displayed as a percentage.
                        score = min(1.0, max(0.0, result['similarity_score']))
                        st.progress(score)
                        st.text(f"{score:.2%} match")
                        # Add direct YouTube link, only when there is a URL
                        # to point at (same guard as the video player).
                        if url:
                            st.markdown(f"[π Watch on YouTube]({url})")
                            st.text("Click to open in a new tab")
                    st.divider()

    # Sidebar with additional information
    with st.sidebar:
        st.header("βΉοΈ About")
        st.write("""
        This demo system simulates a video retrieval engine that uses:
        - π₯ Visual scene understanding
        - π₯ Character interaction analysis
        - π― Object detection
        - π Action recognition
        In a production system, these features would be pre-computed
        from actual movie clips using state-of-the-art AI models.
        """)
        st.header("βοΈ Feature Weights")
        st.write("Current weights used for similarity computation:")
        st.write("- π¬ Visual Features: 40%")
        st.write("- ποΈ Scene Features: 30%")
        st.write("- π¦ Object Features: 30%")
# Run the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()