import gradio as gr
import os
import time
from moviepy.editor import VideoFileClip, AudioFileClip
import imageio
#from share_btn import community_icon_html, loading_icon_html, share_js

#token = os.environ.get('HF_TOKEN')
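# Remote Spaces loaded as callable pipelines via Gradio 3.x `gr.Blocks.load`:
# the CoCa Space turns an image into a caption, the AudioLDM Space turns a text
# prompt into generated audio (served back as a video file, see `infer` below).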
caption = gr.Blocks.load(name="spaces/fffiloni/CoCa-clone")
audio_gen = gr.Blocks.load(name="spaces/haoheliu/audioldm-text-to-audio-generation")

ph_message="If you're not happy with sound result, you can manually describe the scene depicted in your image :)"

def extract_video_frames(video_in):
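    """Grab four frames from the first 5 seconds of the clip, save them as JPGs and return their file paths."""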

    # Load the video file
    clip = VideoFileClip(video_in)
    
    # Calculate the total duration of the video
    total_duration = clip.duration

    if total_duration > 5:
        clip = clip.subclip(0,5)
        total_duration = clip.duration
    
    # Set the intervals to extract the frames, clamped so we never try to read
    # past the end of a clip shorter than 5 seconds
    end = max(total_duration - 0.1, 0)
    intervals = [min(t, end) for t in (0, 2, 4)] + [end]
    
    # Initialize the list to store the extracted frames
    frames = []
    
    # Iterate through the intervals and extract the frames
    for i, interval in enumerate(intervals):
        # Get the frame at the given time
        frame = clip.get_frame(interval)

        # Save the frame as JPG image
        imageio.imwrite(f'frame{i}.jpg', frame)
        # Add the frame to the list
        frames.append(f'frame{i}.jpg')
    print(frames)
    return frames
    

def input_changes(input_vid):
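    """Caption the uploaded video's key frames with CoCa and reset any stale outputs."""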
    
    if input_vid is None:
        return manual_cap.update(value="",placeholder=ph_message), caption_output.update(value=None), video_output.update(value=None), sound_output.update(value=None)
    else:
        picked_frames = extract_video_frames(input_vid)
        caps = []
        for one_frame in picked_frames:
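            # Positional args are assumed to mirror the CoCa Space's UI controls
            # (decoding strategy plus its sampling parameters); fn_index=0 targets
            # the Space's first API endpoint.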
            cap = caption(one_frame, "Beam search", 1.2, 0.5, 5, 20, fn_index=0)
            caps.append(f"the sound of {cap}")
            print(caps)
        final_cap = '\n then '.join(caps)
        print(final_cap)
        print("CoCa caption: '" + final_cap + "' β€’ ")
        ph_update = "CoCa caption: '" + final_cap + "' β€’ "
        
        return manual_cap.update(value="",placeholder=f"{ph_update}{ph_message}"), caption_output.update(value=final_cap), video_output.update(value=None), sound_output.update(value=None)
 
def infer(video_input, manual_caption, duration_in, seed, caption_output):
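    """Generate a sound effect for the caption (manual or CoCa) and mux it back onto the input video."""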
    
    print(duration_in)
    if manual_caption == "":
        cap = caption_output
        #cap = caption(image_input, fn_index=0)
        #print("CoCa caption: '" + cap + "' β€’ ")
        #ph_update = "CoCa caption: '" + cap + "' β€’ "
    else:
        cap = manual_caption
        print("manual caption: " + cap)
        ph_update=""
    
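    # Positional args are assumed to follow the AudioLDM Space's input order
    # (prompt, duration, guidance scale, seed, number of candidates, checkpoint name).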
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, "audioldm-m-text-ft", fn_index=0)

    print(sound)
  
    # AudioLDM loaded demo returns a video, so we only keep the audio
    video = VideoFileClip(sound)
    audio = video.audio
    audio.write_audiofile("sound.mp3")

    # Then we put the audio to the original video
    
    # Load the input video file
    video_in = VideoFileClip(video_input)
    duration = video_in.duration
    if duration > 5:
        video_in = video_in.subclip(0,5)

    new_audio = AudioFileClip("sound.mp3")
    # Make the audio the same length as the video
    new_audio = new_audio.set_duration(video_in.duration)
    
    # Combine the audio and video
    result = video_in.set_audio(new_audio)
    
    # Save the result
    result.write_videofile("result.mp4", codec='libx264', audio_codec='aac')

    
    #return cap, sound[1], gr.Textbox.update(placeholder=f"{ph_update}{ph_message}"), gr.Group.update(visible=True)
    #return cap, "sound.mp3", gr.Group.update(visible=True)
    return cap, "result.mp4", "sound.mp3"

title = """
    <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
            Video to Sound Effect
        </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
        Generate a sound effect matching your video, through CoCa Image Captioning & AudioLDM. <br />
        This demo is experimental and works with 5-second videos; longer clips are trimmed to their first 5 seconds.
        </p>
    </div>
"""

article = """
    
    <div class="footer">
        <p>
         
        Follow <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a> for future updates 🤗
        </p>
    </div>

    <div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;margin-bottom: 30px;">
        <p>You may also like: </p>
        
        <div id="may-like-content" style="display:flex;flex-wrap: wrap;align-items:center;height:20px;">
            
            <svg height="20" width="208" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue" src="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue.png" height="20"/>
                 </a>
            </svg>

            <svg height="20" width="122" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue" src="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue.png" height="20"/>
                 </a>
            </svg>
            
        </div>
    </div>
"""

with gr.Blocks(css="style.css") as demo:
    with gr.Column(elem_id="col-container"):
        
        gr.HTML(title)
    
        input_vid = gr.Video(source="upload", type="filepath", elem_id="input-vid")
        
        with gr.Column():
            manual_cap = gr.Textbox(label="Manual Video description (optional)", lines=3, placeholder=ph_message)
            with gr.Row():
                duration_in = gr.Slider(interactive=False, minimum=5, maximum=10, step=5, value=5, label="Duration")
                seed_in = gr.Slider(label="Seed", value=440, minimum=45, maximum=10000, step=1)
        
        caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
        video_output = gr.Video(label="Result", elem_id="video-output")
        sound_output = gr.Audio()
        #debug = gr.Textbox()
        generate = gr.Button("Generate SFX from Video")

        #with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
        #    community_icon = gr.HTML(community_icon_html)
        #    loading_icon = gr.HTML(loading_icon_html)
        #    share_button = gr.Button("Share to community", elem_id="share-btn")

        gr.HTML(article)

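    # Re-run captioning and clear stale outputs whenever a new video is uploaded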
    change_out = [manual_cap, caption_output, video_output, sound_output]
    input_vid.change(input_changes, input_vid, change_out, queue=False)
    

    
    generate.click(infer, inputs=[input_vid, manual_cap, duration_in, seed_in, caption_output], outputs=[caption_output, video_output, sound_output], api_name="v2fx")
    #share_button.click(None, [], [], _js=share_js)

demo.queue(max_size=32).launch(debug=True)