JoannaKOKO committed on
Commit
20713d5
·
verified ·
1 Parent(s): 808ca72

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -59
app.py CHANGED
@@ -1,60 +1,63 @@
1
  import streamlit as st
2
- from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
3
  import torch
4
  from PIL import Image
5
  import io
6
  import numpy as np
7
  from kokoro import KPipeline # For text-to-speech
8
- #import soundfile as sf
9
 
10
  # Load models globally to avoid reloading them repeatedly
11
- # Image-to-Text model
12
- processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
13
- caption_model = AutoModelForImageTextToText.from_pretrained("Salesforce/blip-image-captioning-large")
14
-
15
- # Text-to-Story model
16
  story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
17
-
18
- # Text-to-Speech model
19
- audio_pipeline = KPipeline(lang_code='a')
20
 
21
  # Function to generate a caption from an image
22
  def generate_caption(image_bytes):
23
- image = Image.open(io.BytesIO(image_bytes))
24
- inputs = processor(images=image, text="Generate a caption:", return_tensors="pt")
25
- outputs = caption_model.generate(**inputs)
26
- caption = processor.decode(outputs[0], skip_special_tokens=True)
27
- return caption
 
 
 
 
28
 
29
  # Function to generate a story from a caption
30
  def generate_story(caption):
31
- prompt = f"Based on the description '{caption}', tell a short story for children aged 3 to 10 in no more than 100 words."
32
- story_output = story_generator(prompt, max_length=150, num_return_sequences=1)
33
- story = story_output[0]["generated_text"]
34
- # Truncate to 100 words if necessary
35
- story_words = story.split()
36
- if len(story_words) > 100:
37
- story = " ".join(story_words[:100])
38
- return story
 
 
 
39
 
40
  # Function to generate audio from a story
41
  def generate_audio(story):
42
- audio_generator = audio_pipeline(
43
- story, voice='af_heart', speed=1, split_pattern=r'\n+'
44
- )
45
- audio_segments = []
46
- # Collect all audio segments
47
- for i, (gs, ps, audio) in enumerate(audio_generator):
48
- audio_segments.append(audio)
49
- if not audio_segments:
 
 
 
 
 
 
 
 
50
  return None
51
- # Concatenate audio segments into a single array
52
- concatenated_audio = np.concatenate(audio_segments)
53
- # Write to a BytesIO buffer instead of saving to disk
54
- audio_buffer = io.BytesIO()
55
- sf.write(audio_buffer, concatenated_audio, 24000, format='WAV')
56
- audio_buffer.seek(0)
57
- return audio_buffer
58
 
59
  # Streamlit UI
60
  st.title("Image to Story Audio Generator")
@@ -66,28 +69,25 @@ if uploaded_file is not None:
66
  image_bytes = uploaded_file.read()
67
  st.image(image_bytes, caption="Uploaded Image", use_column_width=True)
68
 
69
- # Generate and display caption
70
  with st.spinner("Generating caption..."):
71
  caption = generate_caption(image_bytes)
72
- st.write("**Generated Caption:**")
73
- st.write(caption)
 
74
 
75
- # Generate and display story
76
- with st.spinner("Generating story..."):
77
- story = generate_story(caption)
78
- st.write("**Generated Story:**")
79
- st.write(story)
80
 
81
- # Generate and display audio
82
- with st.spinner("Generating audio..."):
83
- audio_buffer = generate_audio(story)
84
- if audio_buffer:
85
- st.audio(audio_buffer, format="audio/wav")
86
- st.download_button(
87
- label="Download Story Audio",
88
- data=audio_buffer,
89
- file_name="story_audio.wav",
90
- mime="audio/wav"
91
- )
92
- else:
93
- st.error("Failed to generate audio.")
 
1
  import streamlit as st
2
+ from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
3
  import torch
4
  from PIL import Image
5
  import io
6
  import numpy as np
7
  from kokoro import KPipeline # For text-to-speech
8
+ import soundfile as sf
9
 
10
  # Load models globally to avoid reloading them repeatedly
11
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
12
+ caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
 
 
13
  story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
14
+ audio_pipeline = KPipeline(lang_code='a') # 'a' is Kokoro's language code for American English
 
 
15
 
16
  # Function to generate a caption from an image
17
  def generate_caption(image_bytes):
18
+ try:
19
+ image = Image.open(io.BytesIO(image_bytes))
20
+ inputs = processor(images=image, return_tensors="pt")
21
+ outputs = caption_model.generate(**inputs)
22
+ caption = processor.decode(outputs[0], skip_special_tokens=True)
23
+ return caption
24
+ except Exception as e:
25
+ st.error(f"Error generating caption: {e}")
26
+ return None
27
 
28
  # Function to generate a story from a caption
29
  def generate_story(caption):
30
+ try:
31
+ prompt = f"Based on the description '{caption}', tell a short story for children aged 3 to 10 in no more than 100 words."
32
+ story_output = story_generator(prompt, max_length=150, num_return_sequences=1)
33
+ story = story_output[0]["generated_text"]
34
+ story_words = story.split()
35
+ if len(story_words) > 100:
36
+ story = " ".join(story_words[:100])
37
+ return story
38
+ except Exception as e:
39
+ st.error(f"Error generating story: {e}")
40
+ return None
41
 
42
  # Function to generate audio from a story
43
  def generate_audio(story):
44
+ try:
45
+ audio_generator = audio_pipeline(
46
+ story, voice='af_heart', speed=1
47
+ )
48
+ audio_segments = []
49
+ for i, (gs, ps, audio) in enumerate(audio_generator):
50
+ audio_segments.append(audio)
51
+ if not audio_segments:
52
+ return None
53
+ concatenated_audio = np.concatenate(audio_segments)
54
+ audio_buffer = io.BytesIO()
55
+ sf.write(audio_buffer, concatenated_audio, 24000, format='WAV')
56
+ audio_buffer.seek(0)
57
+ return audio_buffer
58
+ except Exception as e:
59
+ st.error(f"Error generating audio: {e}")
60
  return None
 
 
 
 
 
 
 
61
 
62
  # Streamlit UI
63
  st.title("Image to Story Audio Generator")
 
69
  image_bytes = uploaded_file.read()
70
  st.image(image_bytes, caption="Uploaded Image", use_column_width=True)
71
 
 
72
  with st.spinner("Generating caption..."):
73
  caption = generate_caption(image_bytes)
74
+ if caption:
75
+ st.write("**Generated Caption:**")
76
+ st.write(caption)
77
 
78
+ with st.spinner("Generating story..."):
79
+ story = generate_story(caption)
80
+ if story:
81
+ st.write("**Generated Story:**")
82
+ st.write(story)
83
 
84
+ with st.spinner("Generating audio..."):
85
+ audio_buffer = generate_audio(story)
86
+ if audio_buffer:
87
+ st.audio(audio_buffer, format="audio/wav")
88
+ st.download_button(
89
+ label="Download Story Audio",
90
+ data=audio_buffer,
91
+ file_name="story_audio.wav",
92
+ mime="audio/wav"
93
+ )