import gradio as gr
import os
import time
from moviepy.editor import VideoFileClip, AudioFileClip
import imageio
#from share_btn import community_icon_html, loading_icon_html, share_js

#token = os.environ.get('HF_TOKEN')
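# Remote Spaces loaded as callable pipelines via Gradio 3.x `gr.Blocks.load`:
# the CoCa Space turns an image into a caption, the AudioLDM Space turns a text
# prompt into generated audio (served back as a video file, see `infer` below).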
caption = gr.Blocks.load(name="spaces/fffiloni/CoCa-clone")
audio_gen = gr.Blocks.load(name="spaces/haoheliu/audioldm-text-to-audio-generation")

ph_message="If you're not happy with sound result, you can manually describe the scene depicted in your image :)"

def extract_video_frames(video_in):
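    """Grab four frames from the first 5 seconds of the clip, save them as JPGs and return their file paths."""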

    # Load the video file
    clip = VideoFileClip(video_in)
    
    # Calculate the total duration of the video
    total_duration = clip.duration

    if total_duration > 5:
        clip = clip.subclip(0,5)
        total_duration = clip.duration
    
    # Set the intervals to extract the frames, clamped so we never try to read
    # past the end of a clip shorter than 5 seconds
    end = max(total_duration - 0.1, 0)
    intervals = [min(t, end) for t in (0, 2, 4)] + [end]
    
    # Initialize the list to store the extracted frames
    frames = []
    
    # Iterate through the intervals and extract the frames
    for i, interval in enumerate(intervals):
        # Get the frame at the given time
        frame = clip.get_frame(interval)

        # Save the frame as JPG image
        imageio.imwrite(f'frame{i}.jpg', frame)
        # Add the frame to the list
        frames.append(f'frame{i}.jpg')
    print(frames)
    return frames
    

def input_changes(input_vid):
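    """Caption the uploaded video's key frames with CoCa and reset any stale outputs."""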
    
    if input_vid is None:
        return manual_cap.update(value="",placeholder=ph_message), caption_output.update(value=None), video_output.update(value=None), sound_output.update(value=None)
    else:
        picked_frames = extract_video_frames(input_vid)
        caps = []
        for one_frame in picked_frames:
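            # Positional args are assumed to mirror the CoCa Space's UI controls
            # (decoding strategy plus its sampling parameters); fn_index=0 targets
            # the Space's first API endpoint.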
            cap = caption(one_frame, "Beam search", 1.2, 0.5, 5, 20, fn_index=0)
            caps.append(f"the sound of {cap}")
            print(caps)
        final_cap = '\n then '.join(caps)
        print(final_cap)
        print("CoCa caption: '" + final_cap + "' β€’ ")
        ph_update = "CoCa caption: '" + final_cap + "' β€’ "
        
        return manual_cap.update(value="",placeholder=f"{ph_update}{ph_message}"), caption_output.update(value=final_cap), video_output.update(value=None), sound_output.update(value=None)
 
def infer(video_input, manual_caption, duration_in, seed, caption_output):
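    """Generate a sound effect for the caption (manual or CoCa) and mux it back onto the input video."""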
    
    print(duration_in)
    if manual_caption == "":
        cap = caption_output
        #cap = caption(image_input, fn_index=0)
        #print("CoCa caption: '" + cap + "' β€’ ")
        #ph_update = "CoCa caption: '" + cap + "' β€’ "
    else:
        cap = manual_caption
        print("manual caption: " + cap)
        ph_update=""
    
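    # Positional args are assumed to follow the AudioLDM Space's input order
    # (prompt, duration, guidance scale, seed, number of candidates, checkpoint name).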
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, "audioldm-m-text-ft", fn_index=0)

    print(sound)
  
    # AudioLDM loaded demo returns a video, so we only keep the audio
    video = VideoFileClip(sound)
    audio = video.audio
    audio.write_audiofile("sound.mp3")

    # Then we put the audio to the original video
    
    # Load the input video file
    video_in = VideoFileClip(video_input)
    duration = video_in.duration
    if duration > 5:
        video_in = video_in.subclip(0,5)

    new_audio = AudioFileClip("sound.mp3")
    # Make the audio the same length as the video
    new_audio = new_audio.set_duration(video_in.duration)
    
    # Combine the audio and video
    result = video_in.set_audio(new_audio)
    
    # Save the result
    result.write_videofile("result.mp4", codec='libx264', audio_codec='aac')

    
    #return cap, sound[1], gr.Textbox.update(placeholder=f"{ph_update}{ph_message}"), gr.Group.update(visible=True)
    #return cap, "sound.mp3", gr.Group.update(visible=True)
    return cap, "result.mp4", "sound.mp3"

title = """
    <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
            Video to Sound Effect
        </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
        Generate a sound effect matching your video, through CoCa Image Captioning & AudioLDM. <br />
        This demo is experimental and works with 5-second videos; longer clips are trimmed to their first 5 seconds.
        </p>
    </div>
"""

article = """
    
    <div class="footer">
        <p>
         
        Follow <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a> for future updates 🤗
        </p>
    </div>

    <div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;margin-bottom: 30px;">
        <p>You may also like: </p>
        
        <div id="may-like-content" style="display:flex;flex-wrap: wrap;align-items:center;height:20px;">
            
            <svg height="20" width="208" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue" src="https://img.shields.io/badge/πŸ€— Spaces-AudioLDM_Text_to_Audio-blue.png" height="20"/>
                 </a>
            </svg>

            <svg height="20" width="122" style="margin-left:4px;margin-bottom: 6px;">       
                 <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">
                    <image href="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue" src="https://img.shields.io/badge/πŸ€— Spaces-Riffusion-blue.png" height="20"/>
                 </a>
            </svg>
            
        </div>
    </div>
"""

with gr.Blocks(css="style.css") as demo:
    with gr.Column(elem_id="col-container"):
        
        gr.HTML(title)
    
        input_vid = gr.Video(source="upload", type="filepath", elem_id="input-vid")
        
        with gr.Column():
            manual_cap = gr.Textbox(label="Manual Video description (optional)", lines=3, placeholder=ph_message)
            with gr.Row():
                duration_in = gr.Slider(interactive=False, minimum=5, maximum=10, step=5, value=5, label="Duration")
                seed_in = gr.Slider(label="Seed", value=440, minimum=45, maximum=10000, step=1)
        
        caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
        video_output = gr.Video(label="Result", elem_id="video-output")
        sound_output = gr.Audio()
        #debug = gr.Textbox()
        generate = gr.Button("Generate SFX from Video")

        #with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
        #    community_icon = gr.HTML(community_icon_html)
        #    loading_icon = gr.HTML(loading_icon_html)
        #    share_button = gr.Button("Share to community", elem_id="share-btn")

        gr.HTML(article)

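    # Re-run captioning and clear stale outputs whenever a new video is uploaded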
    change_out = [manual_cap, caption_output, video_output, sound_output]
    input_vid.change(input_changes, input_vid, change_out, queue=False)
    

    
    generate.click(infer, inputs=[input_vid, manual_cap, duration_in, seed_in, caption_output], outputs=[caption_output, video_output, sound_output], api_name="v2fx")
    #share_button.click(None, [], [], _js=share_js)

demo.queue(max_size=32).launch(debug=True)