import gradio as gr
import os
import time
from moviepy.editor import VideoFileClip, AudioFileClip
import imageio
#from share_btn import community_icon_html, loading_icon_html, share_js
#token = os.environ.get('HF_TOKEN')
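# `gr.Blocks.load` wraps a remote Hugging Face Space as a callable, so the two
# lines below give us an image-captioning function (CoCa) and a text-to-audio
# function (AudioLDM) without hosting either model locally. Both proxies are
# later invoked with fn_index=0, i.e. the first registered endpoint of each Space.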
caption = gr.Blocks.load(name="spaces/fffiloni/CoCa-clone")
audio_gen = gr.Blocks.load(name="spaces/haoheliu/audioldm-text-to-audio-generation")
ph_message = "If you're not happy with the sound result, you can manually describe the scene depicted in your video :)"
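# Pipeline overview:
#   1. extract_video_frames() samples a few frames from (at most) the first
#      5 seconds of the input video;
#   2. input_changes() captions each frame with CoCa and joins the captions;
#   3. infer() sends the caption to AudioLDM, keeps only the audio track of
#      the returned clip, and muxes it back onto the input video.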
def extract_video_frames(video_in):
    # Load the video file
    clip = VideoFileClip(video_in)
    # Calculate the total duration of the video
    total_duration = clip.duration
    if total_duration > 5:
        clip = clip.subclip(0, 5)
        total_duration = clip.duration
    # Set the intervals to extract the frames; sample the last one slightly
    # before the end, since get_frame() at exactly `duration` can read past
    # the final frame (clips shorter than ~4s may still fail at t=2 or t=4)
    intervals = [0, 2, 4, max(total_duration - 0.1, 0)]
    # Initialize the list to store the extracted frames
    frames = []
    # Iterate through the intervals and extract the frames
    for i, interval in enumerate(intervals):
        # Get the frame at the given time
        frame = clip.get_frame(interval)
        # Save the frame as a JPG image
        imageio.imwrite(f'frame{i}.jpg', frame)
        # Add the frame to the list
        frames.append(f'frame{i}.jpg')
    print(frames)
    return frames
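# Fires whenever the input video changes: auto-caption the sampled frames
# and clear any previous results.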
def input_changes(input_vid):
    if input_vid is None:
        return manual_cap.update(value="", placeholder=ph_message), caption_output.update(value=None), video_output.update(value=None), sound_output.update(value=None)
    else:
        picked_frames = extract_video_frames(input_vid)
        caps = []
        for one_frame in picked_frames:
            # Caption each frame with the CoCa Space (beam search decoding)
            cap = caption(one_frame, "Beam search", 1.2, 0.5, 5, 20, fn_index=0)
            caps.append(f"the sound of {cap}")
        print(caps)
        final_cap = '\n then '.join(caps)
        print(final_cap)
        print("CoCa caption: '" + final_cap + "' • ")
        ph_update = "CoCa caption: '" + final_cap + "' • "
        return manual_cap.update(value="", placeholder=f"{ph_update}{ph_message}"), caption_output.update(value=final_cap), video_output.update(value=None), sound_output.update(value=None)
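# Main handler for the "Generate SFX from Video" button: prefer the manual
# caption when provided, otherwise fall back to the auto-generated one.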
def infer(video_input, manual_caption, duration_in, seed, caption_output):
    print(duration_in)
    if manual_caption == "":
        cap = caption_output
        #cap = caption(image_input, fn_index=0)
        #print("CoCa caption: '" + cap + "' • ")
        #ph_update = "CoCa caption: '" + cap + "' • "
    else:
        cap = manual_caption
        print("manual caption: " + cap)
        ph_update = ""
    sound = audio_gen(cap, duration_in, 2.5, seed, 3, "audioldm-m-text-ft", fn_index=0)
    print(sound)
    # The loaded AudioLDM demo returns a video, so we only keep the audio track
    video = VideoFileClip(sound)
    audio = video.audio
    audio.write_audiofile("sound.mp3")
    # Then we put the audio onto the original video
    # Load the input video file
    video_in = VideoFileClip(video_input)
    duration = video_in.duration
    if duration > 5:
        video_in = video_in.subclip(0, 5)
    new_audio = AudioFileClip("sound.mp3")
    # Make the audio the same length as the video
    new_audio = new_audio.set_duration(video_in.duration)
    # Combine the audio and video
    result = video_in.set_audio(new_audio)
    # Save the result
    result.write_videofile("result.mp4", codec='libx264', audio_codec='aac')
    #return cap, sound[1], gr.Textbox.update(placeholder=f"{ph_update}{ph_message}"), gr.Group.update(visible=True)
    #return cap, "sound.mp3", gr.Group.update(visible=True)
    return cap, "result.mp4", "sound.mp3"
title = """
    <div style="text-align: center; max-width: 700px; margin: 0 auto;">
        <div
            style="
                display: inline-flex;
                align-items: center;
                gap: 0.8rem;
                font-size: 1.75rem;
            "
        >
            <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                Video to Sound Effect
            </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
            Convert frames from a short video into a matching sound effect, generated with CoCa Image Captioning & AudioLDM. <br />
            This demo is experimental; input videos are trimmed to their first 5 seconds.
        </p>
    </div>
"""
article = """
    <div class="footer">
        <p>
            Follow <a href="https://twitter.com/fffiloni" target="_blank">Sylvain Filoni</a> for future updates 🤗
        </p>
    </div>
    <div id="may-like-container" style="display: flex;justify-content: center;flex-direction: column;align-items: center;margin-bottom: 30px;">
        <p>You may also like: </p>
        <div id="may-like-content" style="display:flex;flex-wrap: wrap;align-items:center;height:20px;">
            <svg height="20" width="208" style="margin-left:4px;margin-bottom: 6px;">
                <a href="https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation" target="_blank">
                    <image href="https://img.shields.io/badge/🤗 Spaces-AudioLDM_Text_to_Audio-blue" src="https://img.shields.io/badge/🤗 Spaces-AudioLDM_Text_to_Audio-blue.png" height="20"/>
                </a>
            </svg>
            <svg height="20" width="122" style="margin-left:4px;margin-bottom: 6px;">
                <a href="https://huggingface.co/spaces/fffiloni/spectrogram-to-music" target="_blank">
                    <image href="https://img.shields.io/badge/🤗 Spaces-Riffusion-blue" src="https://img.shields.io/badge/🤗 Spaces-Riffusion-blue.png" height="20"/>
                </a>
            </svg>
        </div>
    </div>
"""
with gr.Blocks(css="style.css") as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        input_vid = gr.Video(source="upload", type="filepath", elem_id="input-vid")
        with gr.Column():
            manual_cap = gr.Textbox(label="Manual Video description (optional)", lines=3, placeholder=ph_message)
            with gr.Row():
                duration_in = gr.Slider(interactive=False, minimum=5, maximum=10, step=5, value=5, label="Duration")
                seed_in = gr.Slider(label="Seed", value=440, minimum=45, maximum=10000, step=1)
        caption_output = gr.Textbox(label="Caption", visible=False, elem_id="text-caption")
        video_output = gr.Video(label="Result", elem_id="video-output")
        sound_output = gr.Audio()
        #debug = gr.Textbox()
        generate = gr.Button("Generate SFX from Video")
        #with gr.Group(elem_id="share-btn-container", visible=False) as share_group:
        #    community_icon = gr.HTML(community_icon_html)
        #    loading_icon = gr.HTML(loading_icon_html)
        #    share_button = gr.Button("Share to community", elem_id="share-btn")
        gr.HTML(article)

    change_out = [manual_cap, caption_output, video_output, sound_output]
    input_vid.change(input_changes, input_vid, change_out, queue=False)
    generate.click(infer, inputs=[input_vid, manual_cap, duration_in, seed_in, caption_output], outputs=[caption_output, video_output, sound_output], api_name="v2fx")
    #share_button.click(None, [], [], _js=share_js)

demo.queue(max_size=32).launch(debug=True)
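# A minimal client-side sketch (assuming the Space is deployed and reachable;
# the Space id below is a placeholder). The `api_name="v2fx"` set on the click
# handler above exposes the endpoint as "/v2fx" for `gradio_client`:
#
# from gradio_client import Client
# client = Client("user/video-to-sfx")  # hypothetical Space id
# caption, video_path, audio_path = client.predict(
#     "my_clip.mp4",  # video_input
#     "",             # manual_caption (empty -> use the auto caption)
#     5,              # duration_in
#     440,            # seed
#     "",             # caption_output (normally filled by the change handler)
#     api_name="/v2fx",
# )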