JoannaKOKO committed on
Commit
9bc975a
·
verified ·
1 Parent(s): 6da7857

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -55
app.py CHANGED
@@ -4,11 +4,10 @@ import torch
4
  from PIL import Image
5
  import io
6
  import numpy as np
7
- from kokoro import KPipeline # for text-to-speech
8
- from IPython.display import display, Audio
9
- import soundfile as sf
10
 
11
- # Load models
12
  # Image-to-Text model
13
  processor = AutoProcessor.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
14
  caption_model = AutoModelForCausalLM.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
@@ -16,55 +15,46 @@ caption_model = AutoModelForCausalLM.from_pretrained("Ertugrul/Qwen2-VL-7B-Capti
16
  # Text-to-Story model
17
  story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
18
 
19
- # Load the text-to-speech model
 
20
 
21
- for i, (gs, ps, audio) in enumerate(audio_generator):
22
- print(i) # i => index
23
- print(gs) # gs => graphemes/text
24
- print(ps) # ps => phonemes
25
- display(Audio(data=audio, rate=24000, autoplay=i==0))
26
- sf.write(f'{i}.wav', audio, 24000) # save each audio file
 
27
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- def generate_text(image_bytes):
30
- # load image-to-text model
31
- processor = AutoProcessor.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
32
- caption_model = AutoModelForCausalLM.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
33
-
34
- # Convert bytes to PIL Image
35
- image = Image.open(io.BytesIO(image_bytes))
36
-
37
- # Step 1: Generate text from image
38
- inputs = processor(images=image, text="Generate a caption:", return_tensors="pt")
39
- outputs = caption_model.generate(**inputs)
40
- text = processor.decode(outputs[0], skip_special_tokens=True)
41
- return text
42
-
43
- def generate_story(text):
44
- # load text-to-story model
45
- story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
46
-
47
- # Step 2: Generate story from caption
48
- prompt = f"Based on the description '{text}', tell a short story for children aged 3 to 10 in no more than 100 words."
49
- story_output = story_generator(prompt, max_length=150, num_return_sequences=1)
50
- story = story_output[0]["generated_text"]
51
- return story
52
-
53
-
54
- def generate_audio(story):
55
- audio_pipeline = KPipeline(lang_code='a')
56
- audio_generator = audio_pipeline(
57
- story, voice='af_heart', # <= change voice here
58
- speed=1, split_pattern=r'\n+'
59
- )
60
- for i, (gs, ps, audio) in enumerate(audio_generator):
61
- print(i) # i => index
62
- print(gs) # gs => graphemes/text
63
- print(ps) # ps => phonemes
64
- display(Audio(data=audio, rate=24000, autoplay=i==0))
65
- sf.write(f'{i}.wav', audio, 24000) # save each audio file
66
-
67
-
68
 
69
  # Streamlit UI
70
  st.title("Image to Story Audio Generator")
@@ -76,8 +66,28 @@ if uploaded_file is not None:
76
  image_bytes = uploaded_file.read()
77
  st.image(image_bytes, caption="Uploaded Image", use_column_width=True)
78
 
79
- with st.spinner("Generating story audio..."):
80
- #audio, sample_rate = generate_story_audio(image_bytes)
81
- text = generate_text(image_bytes)
82
- story = generate_story(text)
83
- generate_audio(story)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from PIL import Image
5
  import io
6
  import numpy as np
7
+ from kokoro import KPipeline # For text-to-speech
8
+ import soundfile as sf
 
9
 
10
+ # Load models globally to avoid reloading them repeatedly
11
  # Image-to-Text model
12
  processor = AutoProcessor.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
13
  caption_model = AutoModelForCausalLM.from_pretrained("Ertugrul/Qwen2-VL-7B-Captioner-Relaxed")
 
15
  # Text-to-Story model
16
  story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
17
 
18
+ # Text-to-Speech model
19
+ audio_pipeline = KPipeline(lang_code='a')
20
 
21
# Function to generate a caption from an image
def generate_caption(image_bytes):
    """Return a text caption for the image encoded in *image_bytes*.

    Decodes the raw bytes into a PIL image, runs the globally loaded
    vision-language captioning model, and decodes the generated token ids
    back into a plain string (special tokens stripped).
    """
    pil_image = Image.open(io.BytesIO(image_bytes))
    model_inputs = processor(images=pil_image, text="Generate a caption:", return_tensors="pt")
    generated_ids = caption_model.generate(**model_inputs)
    return processor.decode(generated_ids[0], skip_special_tokens=True)
28
 
29
# Function to generate a story from a caption
def generate_story(caption):
    """Generate a children's story of at most 100 words from an image caption.

    Parameters
    ----------
    caption : str
        The image description produced by the captioning model.

    Returns
    -------
    str
        The generated story, hard-truncated to 100 words.
    """
    prompt = f"Based on the description '{caption}', tell a short story for children aged 3 to 10 in no more than 100 words."
    # max_new_tokens bounds only the generated continuation; the previous
    # max_length=150 also counted the prompt's own tokens, which could leave
    # little or no room for the story itself on longer captions.
    # return_full_text=False makes the pipeline return only the continuation,
    # so the 100-word cap below applies to the story alone instead of to
    # prompt + story (by default generated_text echoes the prompt).
    story_output = story_generator(
        prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        return_full_text=False,
    )
    story = story_output[0]["generated_text"].strip()
    # Truncate to 100 words if necessary
    story_words = story.split()
    if len(story_words) > 100:
        story = " ".join(story_words[:100])
    return story
39
 
40
# Function to generate audio from a story
def generate_audio(story):
    """Synthesize *story* to speech and return it as an in-memory WAV buffer.

    Splits the text on blank lines via the TTS pipeline, gathers every
    generated waveform segment, concatenates them into a single 24 kHz
    clip, and encodes it as WAV into a BytesIO buffer (rewound to the
    start). Returns None when the pipeline yields no segments.
    """
    segments = [
        segment_audio
        for _graphemes, _phonemes, segment_audio in audio_pipeline(
            story, voice='af_heart', speed=1, split_pattern=r'\n+'
        )
    ]
    if not segments:
        return None
    # Stitch the per-segment waveforms into one array and write WAV data
    # to memory instead of saving files to disk.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, np.concatenate(segments), 24000, format='WAV')
    wav_buffer.seek(0)
    return wav_buffer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  # Streamlit UI
60
  st.title("Image to Story Audio Generator")
 
66
  image_bytes = uploaded_file.read()
67
  st.image(image_bytes, caption="Uploaded Image", use_column_width=True)
68
 
69
+ # Generate and display caption
70
+ with st.spinner("Generating caption..."):
71
+ caption = generate_caption(image_bytes)
72
+ st.write("**Generated Caption:**")
73
+ st.write(caption)
74
+
75
+ # Generate and display story
76
+ with st.spinner("Generating story..."):
77
+ story = generate_story(caption)
78
+ st.write("**Generated Story:**")
79
+ st.write(story)
80
+
81
+ # Generate and display audio
82
+ with st.spinner("Generating audio..."):
83
+ audio_buffer = generate_audio(story)
84
+ if audio_buffer:
85
+ st.audio(audio_buffer, format="audio/wav")
86
+ st.download_button(
87
+ label="Download Story Audio",
88
+ data=audio_buffer,
89
+ file_name="story_audio.wav",
90
+ mime="audio/wav"
91
+ )
92
+ else:
93
+ st.error("Failed to generate audio.")