RajatMalviya committed
Commit 3517ef3 · verified · 1 Parent(s): 44a6a1d

Update app.py

Files changed (1)
  1. app.py +340 -31
app.py CHANGED
@@ -1,48 +1,357 @@
- import streamlit as st
- import tempfile
  import os
- import librosa  # For audio resampling
  import torch
- from transformers import WhisperProcessor, WhisperForConditionalGeneration

- # Load the model and processor
- @st.cache_resource
- def load_model():
-     processor = WhisperProcessor.from_pretrained("ivrit-ai/whisper-large-v3-turbo")
-     model = WhisperForConditionalGeneration.from_pretrained("ivrit-ai/whisper-large-v3-turbo")
-     return processor, model

- processor, model = load_model()

- st.title("Hebrew Speech-to-Text Transcription")

- # Upload audio file
- uploaded_file = st.file_uploader("Upload an audio file (WAV, MP3, OGG)", type=["wav", "mp3", "ogg"])

- if uploaded_file is not None:
-     # Save the uploaded file to a temporary location
-     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
-         temp_audio.write(uploaded_file.read())
-         temp_audio_path = temp_audio.name

      try:
-         # Load and resample audio to 16kHz (required by Whisper)
-         speech_array, sampling_rate = librosa.load(temp_audio_path, sr=16000)

-         # Preprocess audio
-         inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt")

-         # Generate transcription
          with torch.no_grad():
-             predicted_ids = model.generate(inputs.input_features)

-         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

-         st.subheader("Transcription:")
-         st.write(transcription)

      except Exception as e:
-         st.error(f"Error: {str(e)}")

-     # Clean up the temporary file
-     os.remove(temp_audio_path)
+ import gradio as gr
+ import ffmpeg
  import os
+ import uuid
+ import requests
+ import tempfile
+ import shutil
+ import re
+ import time
+ import concurrent.futures
  import torch
+ from pathlib import Path
+ from dotenv import load_dotenv
+ from transformers import AutoProcessor, AutoTokenizer, LlavaForConditionalGeneration
+ from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
+ from PIL import Image

+ # Add GPU decorator for Hugging Face Spaces
+ try:
+     from spaces import GPU
+     use_gpu = True
+     @GPU
+     def get_gpu():
+         return True
+     # Call the function to trigger GPU allocation
+     get_gpu()
+ except ImportError:
+     use_gpu = False
+     print("Running without GPU acceleration")

+ # Load environment variables from .env file if it exists
+ load_dotenv()

+ # Global variables to hold models (lazy loading)
+ llava_model = None
+ llava_processor = None
+ stable_diffusion_pipeline = None

+ def load_llava_model():
+     """Load LLaVA model for image captioning"""
+     global llava_model, llava_processor

+     if llava_model is None or llava_processor is None:
+         print("Loading LLaVA model for image analysis...")
+         model_id = "llava-hf/llava-1.5-7b-hf"

+         # Load the processor and the LLaVA-specific model class (reduced precision for memory efficiency)
+         llava_processor = AutoProcessor.from_pretrained(model_id)
+         llava_model = LlavaForConditionalGeneration.from_pretrained(
+             model_id,
+             torch_dtype=torch.float16,
+             device_map="auto"
+         )

+     return llava_model, llava_processor

+ def load_stable_diffusion_model():
+     """Load Stable Diffusion model for Ghibli-style image generation"""
+     global stable_diffusion_pipeline

+     if stable_diffusion_pipeline is None:
+         print("Loading Stable Diffusion model for image generation...")
+         model_id = "nitrosocke/Ghibli-Diffusion"

+         # Load the pipeline with reduced precision to balance memory usage and quality
+         stable_diffusion_pipeline = StableDiffusionPipeline.from_pretrained(
+             model_id,
+             torch_dtype=torch.float16,
+             safety_checker=None  # Disable safety checker for performance
+         )

+         # Move to GPU if available
+         if torch.cuda.is_available():
+             stable_diffusion_pipeline = stable_diffusion_pipeline.to("cuda")

+         # Use the DPM-Solver++ scheduler for better quality at lower steps
+         stable_diffusion_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
+             stable_diffusion_pipeline.scheduler.config,
+             algorithm_type="dpmsolver++",
+             use_karras_sigmas=True
+         )

+     return stable_diffusion_pipeline

+ def analyze_image_with_llava(image_path):
+     """Process a single frame with LLaVA to generate a description"""
      try:
+         # Load the model if not already loaded
+         model, processor = load_llava_model()

+         # Load the image
+         image = Image.open(image_path)

+         # Prompt for a Ghibli-specific description, in the llava-1.5 chat format with the <image> placeholder
+         prompt = "USER: <image>\nDescribe this image in detail, focusing on elements that would be important to recreate it in Studio Ghibli animation style. ASSISTANT:"

+         # Process the image, casting floating-point inputs to match the fp16 weights
+         inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16)

+         # Generate with appropriate parameters
          with torch.no_grad():
+             output = model.generate(
+                 **inputs,
+                 max_new_tokens=300,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9,
+             )

+         # Decode the output
+         generated_text = processor.decode(output[0], skip_special_tokens=True)

+         # Extract just the response part (everything after the assistant marker)
+         response = generated_text.split("ASSISTANT:")[-1].strip()
+         print(f"LLaVA analysis for frame {os.path.basename(image_path)}: {response[:150]}...")

+         return response

      except Exception as e:
+         import traceback
+         print(f"Error analyzing image {os.path.basename(image_path)}: {str(e)}")
+         print(traceback.format_exc())
+         return f"Error analyzing image: {str(e)}"

+ def generate_ghibli_image(image_description, style_prompt, output_path):
+     """Generate a Ghibli-style image based on the description using Stable Diffusion"""
+     try:
+         # Load the model if not already loaded
+         pipeline = load_stable_diffusion_model()

+         # Combine the image description with the style prompt
+         full_prompt = f"{image_description}. {style_prompt}. Hand-drawn animation style, soft colors, attention to detail, Miyazaki aesthetic."

+         # Ensure prompt isn't too long
+         if len(full_prompt) > 500:
+             full_prompt = full_prompt[:497] + "..."

+         # Generate the image
+         with torch.autocast("cuda" if torch.cuda.is_available() else "cpu"):
+             image = pipeline(
+                 prompt=full_prompt,
+                 negative_prompt="3d, cgi, low quality, blurry, distorted, deformed",
+                 num_inference_steps=30,
+                 guidance_scale=7.5,
+                 width=768,
+                 height=768,
+             ).images[0]

+         # Save the generated image
+         image.save(output_path)
+         print(f"Successfully saved stylized frame: {os.path.basename(output_path)}")
+         return True

+     except Exception as e:
+         import traceback
+         print(f"Error generating image: {str(e)}")
+         print(traceback.format_exc())
+         return False

+ def process_frame(frame_path, style_prompt):
+     """Process a single frame with LLaVA analysis and Stable Diffusion generation"""
+     try:
+         # First use LLaVA to analyze the image
+         image_description = analyze_image_with_llava(frame_path)

+         if image_description.startswith("Error"):
+             return False

+         # Now use Stable Diffusion to generate a stylized version
+         result = generate_ghibli_image(image_description, style_prompt, frame_path)

+         return result

+     except Exception as e:
+         import traceback
+         print(f"Error processing frame {os.path.basename(frame_path)}: {str(e)}")
+         print(traceback.format_exc())
+         return False

+ def stylize_video(video_path, style_prompt, num_frames=15):
+     try:
+         # Create temp directories
+         temp_dir = tempfile.mkdtemp()
+         input_filename = os.path.join(temp_dir, "input.mp4")
+         frames_dir = os.path.join(temp_dir, "frames")
+         os.makedirs(frames_dir, exist_ok=True)

+         # Save the input video to a temporary file
+         if isinstance(video_path, str):
+             if video_path.startswith('http'):
+                 # It's a URL, download it
+                 response = requests.get(video_path, stream=True)
+                 with open(input_filename, 'wb') as f:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         f.write(chunk)
+             elif os.path.exists(video_path):
+                 # It's a file path, copy it
+                 shutil.copy(video_path, input_filename)
+             else:
+                 return None, f"Video file not found: {video_path}"
+         else:
+             # Assume it's binary data
+             with open(input_filename, "wb") as f:
+                 f.write(video_path)

+         # Make sure the video file exists
+         if not os.path.exists(input_filename):
+             return None, "Failed to save input video"

+         # Extract frames - using lower fps for longer videos (1 frame per second)
+         ffmpeg.input(input_filename).output(f"{frames_dir}/%04d.png", vf="fps=1").run(quiet=True)

+         # Check if frames were extracted
+         frames = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith('.png')])
+         if not frames:
+             return None, "No frames were extracted from the video"

+         # Limit to a maximum number of frames for reasonable processing times
+         if len(frames) > num_frames:
+             # Take evenly distributed frames
+             indices = [int(i * (len(frames) - 1) / (num_frames - 1)) for i in range(num_frames)]
+             frames = [frames[i] for i in indices]

+         print(f"Processing {len(frames)} frames")

+         # Process frames sequentially if we're using a GPU (to avoid CUDA OOM errors)
+         # Otherwise, use a modest level of parallelism
+         if torch.cuda.is_available():
+             # Sequential processing to avoid CUDA OOM errors
+             processed_frames = []
+             for i, frame in enumerate(frames):
+                 success = process_frame(frame, style_prompt)
+                 if success:
+                     processed_frames.append(frame)
+                     print(f"Completed frame {os.path.basename(frame)} ({i+1}/{len(frames)})")
+                 else:
+                     print(f"Failed to process frame {os.path.basename(frame)}")

+                 # Free up CUDA cache between frames
+                 torch.cuda.empty_cache()
+         else:
+             # Process frames in parallel with limited workers if no GPU
+             with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+                 futures = {executor.submit(process_frame, frame, style_prompt): frame for frame in frames}

+                 # Collect results
+                 processed_frames = []
+                 for future in concurrent.futures.as_completed(futures):
+                     frame = futures[future]
+                     if future.result():
+                         processed_frames.append(frame)
+                         print(f"Completed frame {os.path.basename(frame)} ({len(processed_frames)}/{len(frames)})")

+         if not processed_frames:
+             return None, "Failed to process any frames. Please check the logs for more information."

+         # Even if not all frames were processed, try to create a video with what we have
+         print(f"Successfully processed {len(processed_frames)}/{len(frames)} frames")

+         # Ensure frames are in the correct order (important for video continuity)
+         processed_frames.sort()

+         # Reassemble frames into video
+         output_filename = os.path.join(temp_dir, "stylized.mp4")

+         # Use a higher bitrate and better codec for higher quality
+         ffmpeg.input(f"{frames_dir}/%04d.png", framerate=1) \
+             .output(output_filename, vcodec='libx264', pix_fmt='yuv420p', crf=18) \
+             .run(quiet=True)

+         # Check if the output file exists and has content
+         if not os.path.exists(output_filename) or os.path.getsize(output_filename) == 0:
+             return None, "Failed to create output video"

+         # Copy to a persistent location for Gradio to serve
+         os.makedirs("outputs", exist_ok=True)
+         persistent_output = os.path.join("outputs", f"stylized_{uuid.uuid4()}.mp4")
+         shutil.copy(output_filename, persistent_output)

+         # Return the relative path (Gradio can handle this)
+         print(f"Output video created at: {persistent_output}")

+         # Cleanup temp files
+         shutil.rmtree(temp_dir)

+         return persistent_output, f"Video stylized successfully with {len(processed_frames)} frames!"

+     except Exception as e:
+         import traceback
+         traceback_str = traceback.format_exc()
+         print(f"Error: {str(e)}\n{traceback_str}")
+         return None, f"Error: {str(e)}"

+ # Use Gradio examples feature with local files
+ example_videos = [
+     ["sample_video.mp4", "Studio Ghibli animation with Hayao Miyazaki's distinctive hand-drawn art style"]
+ ]

+ with gr.Blocks(title="Video-to-Ghibli Style Converter (Open Source)") as iface:
+     gr.Markdown("# Video-to-Ghibli Style Converter (Open Source)")
+     gr.Markdown("Upload a video and convert it to Studio Ghibli animation style using LLaVA and Stable Diffusion.")

+     with gr.Row():
+         with gr.Column(scale=2):
+             # Main input column
+             video_input = gr.Video(label="Upload Video (up to 15 seconds)")

+             style_prompt = gr.Textbox(
+                 label="Style Prompt",
+                 value="Studio Ghibli animation with Hayao Miyazaki's distinctive hand-drawn art style"
+             )

+             num_frames_slider = gr.Slider(
+                 minimum=5,
+                 maximum=15,
+                 value=10,
+                 step=1,
+                 label="Number of frames to process"
+             )

+             submit_btn = gr.Button("Stylize Video", variant="primary")

+         with gr.Column(scale=2):
+             # Output column
+             video_output = gr.Video(label="Stylized Video")
+             status_output = gr.Textbox(label="Status", value="Ready. Upload a video to start.")

+     submit_btn.click(
+         fn=stylize_video,
+         inputs=[video_input, style_prompt, num_frames_slider],
+         outputs=[video_output, status_output]
+     )

+     gr.Markdown("""
+ ## Instructions
+ 1. Upload a video up to 15 seconds long
+ 2. Customize the style prompt if desired
+ 3. Adjust the number of frames to process (fewer = faster)
+ 4. Click "Stylize Video" and wait for processing

+ ## Example Style Prompts
+ - "Studio Ghibli animation with Hayao Miyazaki's distinctive hand-drawn art style"
+ - "Studio Ghibli style with magical and dreamy atmosphere"
+ - "Nostalgic Studio Ghibli animation style with watercolor backgrounds and clean linework"
+ - "Ghibli-inspired animation with vibrant colors and fantasy elements"

+ Note: Each frame is analyzed by LLaVA-1.5-7B and then transformed by Stable Diffusion (Ghibli-Diffusion model).
+ Videos are processed at 1 frame per second to keep processing time reasonable.

+ ## Technical Details
+ - Image Analysis: Using LLaVA-1.5-7B for frame understanding and description
+ - Image Generation: Using Stable Diffusion (nitrosocke/Ghibli-Diffusion) for style transfer
+ - All processing happens locally - no API keys needed!
+ """)

+ if __name__ == "__main__":
+     iface.launch()
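
For reference, a minimal sketch of how the new stylize_video() entry point could be exercised outside the Gradio UI. The clip name sample_video.mp4 (the same file referenced by example_videos) and the num_frames value are assumptions; the sketch also assumes app.py is importable from the working directory, ffmpeg is installed, and the imported Python packages are available.

# Hypothetical smoke test; not part of this commit.
# stylize_video() extracts frames at 1 fps, captions each selected frame with
# LLaVA, regenerates it in place with Ghibli-Diffusion, and reassembles the
# frames into a video copied to outputs/stylized_<uuid>.mp4.
from app import stylize_video

output_path, status = stylize_video(
    "sample_video.mp4",  # assumed local clip
    "Studio Ghibli animation with Hayao Miyazaki's distinctive hand-drawn art style",
    num_frames=5,  # fewer frames keeps the run short
)
print(status)
print("Stylized video:", output_path)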