import gradio as gr
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoModelForCausalLM, AutoTokenizer
import torch
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import textwrap
import os
import gc
import re
import psutil
from datetime import datetime
import spaces
from kokoro import KPipeline
import soundfile as sf
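
# Assumed runtime dependencies (a sketch of this Space's requirements.txt,
# not pinned here): gradio, torch, diffusers, transformers, pillow, numpy,
# psutil, spaces, kokoro, soundfile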

def clear_memory():
    """Helper function to clear both CUDA and system memory, safe for Spaces environment."""
    gc.collect()
    # Only perform CUDA operations if we're in a GPU task context.
    # `current_task` is probed defensively via hasattr, since it is not a
    # guaranteed public attribute of the spaces package.
    if hasattr(spaces, "current_task") and spaces.current_task and torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    process = psutil.Process(os.getpid())

    # Collect all three GC generations explicitly
    gc.collect(generation=0)
    gc.collect(generation=1)
    gc.collect(generation=2)

    # Only log GPU stats if we're in a GPU task context
    if hasattr(spaces, "current_task") and spaces.current_task and torch.cuda.is_available():
        print(f"GPU Memory allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
        print(f"GPU Memory cached: {torch.cuda.memory_reserved()/1024**2:.2f} MB")
    print(f"CPU RAM used: {process.memory_info().rss/1024**2:.2f} MB")

# Initialize models at startup - only the lightweight ones
print("Loading models...")

# Load SmolVLM for image analysis
processor_vlm = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-500M-Instruct")
model_vlm = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-500M-Instruct",
    torch_dtype=torch.bfloat16
).to("cuda")

# Load SmolLM2 for story and prompt generation
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
tokenizer_lm = AutoTokenizer.from_pretrained(checkpoint)
model_lm = AutoModelForCausalLM.from_pretrained(checkpoint).to("cuda")

# Initialize Kokoro TTS pipeline
pipeline = KPipeline(lang_code='a')  # 'a' for American English
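
# Usage sketch for the Kokoro pipeline (it mirrors how the generator is
# consumed in generate_combined_audio_from_story below; Kokoro outputs
# 24 kHz audio):
#   for _, _, audio in pipeline("Hello there.", voice='af_heart', speed=1):
#       sf.write("hello.wav", audio, 24000)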

def load_sd_model():
    """Load Stable Diffusion model only when needed."""
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16,
    )
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe.to("cuda")
    pipe.enable_attention_slicing()
    return pipe
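
# Design note: the SD pipeline is loaded per request and deleted right after
# use, trading a few seconds of load time per image for a smaller resident
# GPU footprint while the Space idles. The DPM-Solver++ multistep scheduler
# reaches good quality in fewer steps than the default, and attention
# slicing reduces peak VRAM at a modest speed cost.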

def generate_image():
    """Generate a random landscape image."""
    clear_memory()
    pipe = load_sd_model()

    default_prompt = "a beautiful, professional landscape photograph"
    default_negative_prompt = "blurry, bad quality, distorted, deformed"
    default_steps = 30
    default_guidance = 7.5
    default_seed = torch.randint(0, 2**32 - 1, (1,)).item()
    generator = torch.Generator("cuda").manual_seed(default_seed)

    try:
        image = pipe(
            prompt=default_prompt,
            negative_prompt=default_negative_prompt,
            num_inference_steps=default_steps,
            guidance_scale=default_guidance,
            generator=generator,
        ).images[0]
        del pipe
        clear_memory()
        return image
    except Exception as e:
        print(f"Error generating image: {e}")
        if 'pipe' in locals():
            del pipe
        clear_memory()
        return None

def analyze_image(image):
    if image is None:
        return "Please generate an image first."
    clear_memory()

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image. Be brief but descriptive."}
            ]
        }
    ]

    try:
        prompt = processor_vlm.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor_vlm(
            text=prompt,
            images=[image],
            return_tensors="pt"
        ).to('cuda')
        outputs = model_vlm.generate(
            input_ids=inputs.input_ids,
            pixel_values=inputs.pixel_values,
            attention_mask=inputs.attention_mask,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            max_new_tokens=500,
            min_new_tokens=10
        )
        description = processor_vlm.decode(outputs[0], skip_special_tokens=True)
        description = re.sub(r".*?Assistant:\s*", "", description, flags=re.DOTALL).strip()

        # Split into sentences and keep only the first three
        sentences = re.split(r'(?<=[.!?])\s+', description)
        description = ' '.join(sentences[:3])

        clear_memory()
        return description
    except Exception as e:
        print(f"Error analyzing image: {e}")
        clear_memory()
        return "Error analyzing image. Please try again."

def generate_story(image_description):
    clear_memory()
    story_prompt = f"""Write a short children's story (one chapter, about 500 words) based on this scene: {image_description}
Requirements:
1. Main character: An English bulldog named Champ
2. Include these values: confidence, teamwork, caring, and hope
3. Theme: "We are stronger together than as individuals"
4. Keep it simple and engaging for young children
5. End with a simple moral lesson"""
    try:
        messages = [{"role": "user", "content": story_prompt}]
        input_text = tokenizer_lm.apply_chat_template(messages, tokenize=False)
        inputs = tokenizer_lm.encode(input_text, return_tensors="pt").to("cuda")
        outputs = model_lm.generate(
            inputs,
            max_new_tokens=750,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2
        )
        story = tokenizer_lm.decode(outputs[0])
        story = clean_story_output(story)
        clear_memory()
        return story
    except Exception as e:
        print(f"Error generating story: {e}")
        clear_memory()
        return "Error generating story. Please try again."

def generate_image_prompts(story_text):
    clear_memory()
    paragraphs = split_into_paragraphs(story_text)
    all_prompts = []
    prompt_instruction = '''Here is a story paragraph: {paragraph}
Start your response with "Watercolor bulldog" and describe what Champ is doing in this scene. Add where it takes place and one mood detail. Keep it short.'''
    try:
        for i, paragraph in enumerate(paragraphs, 1):
            messages = [{"role": "user", "content": prompt_instruction.format(paragraph=paragraph)}]
            input_text = tokenizer_lm.apply_chat_template(messages, tokenize=False)
            inputs = tokenizer_lm.encode(input_text, return_tensors="pt").to("cuda")
            outputs = model_lm.generate(
                inputs,
                max_new_tokens=30,
                temperature=0.5,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.2
            )
            prompt = process_generated_prompt(tokenizer_lm.decode(outputs[0]), paragraph)
            section = f"Paragraph {i}:\n{paragraph}\n\nScenery Prompt {i}:\n{prompt}\n\n{'='*50}"
            all_prompts.append(section)
            clear_memory()
        return '\n'.join(all_prompts)
    except Exception as e:
        print(f"Error generating prompts: {e}")
        clear_memory()
        return "Error generating prompts. Please try again."

def generate_story_image(prompt, seed=-1):
    clear_memory()
    pipe = load_sd_model()
    try:
        pipe.load_lora_weights("Prof-Hunt/lora-bulldog")
        generator = torch.Generator("cuda")
        if seed != -1:
            generator.manual_seed(seed)
        else:
            generator.manual_seed(torch.randint(0, 2**32 - 1, (1,)).item())
        enhanced_prompt = f"{prompt}, watercolor style, children's book illustration, soft colors"
        image = pipe(
            prompt=enhanced_prompt,
            negative_prompt="deformed, ugly, blurry, bad art, poor quality, distorted",
            num_inference_steps=50,
            guidance_scale=15,
            generator=generator
        ).images[0]
        pipe.unload_lora_weights()
        del pipe
        clear_memory()
        return image
    except Exception as e:
        print(f"Error generating image: {e}")
        if 'pipe' in locals():
            pipe.unload_lora_weights()
            del pipe
        clear_memory()
        return None
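
# Note: generate_all_scenes below is a Python generator. Gradio runs
# generator callbacks in streaming mode, so each `yield` refreshes the
# gallery and the progress textbox while later scenes are still rendering.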

def generate_all_scenes(prompts_text):
    clear_memory()
    generated_images = []
    formatted_prompts = []
    progress_messages = []
    total_scenes = len([s for s in prompts_text.split('='*50) if s.strip()])

    def update_progress():
        """Create a progress message showing completed/total scenes."""
        completed = len(generated_images)
        message = f"Generated {completed}/{total_scenes} scenes\n\n"
        if progress_messages:
            message += "\n".join(progress_messages[-3:])  # Show the last 3 status messages
        return message

    sections = prompts_text.split('='*50)
    for section_num, section in enumerate(sections, 1):
        if not section.strip():
            continue

        # Pull the scenery prompt out of this section: it is the line that
        # follows the "Scenery Prompt N:" header.
        scene_prompt = None
        lines = section.split('\n')
        for line_index, line in enumerate(lines):
            if 'Scenery Prompt' in line:
                scene_num = line.split('Scenery Prompt')[1].split(':')[0].strip()
                if line_index + 1 < len(lines):
                    scene_prompt = lines[line_index + 1].strip()
                    formatted_prompts.append(f"Scene {scene_num}: {scene_prompt}")
                break

        if scene_prompt:
            try:
                clear_memory()
                status_msg = f"🎨 Creating scene {section_num}: '{scene_prompt[:50]}...'"
                progress_messages.append(status_msg)
                # Yield a progress update before starting the render
                yield generated_images, "\n\n".join(formatted_prompts), update_progress()

                image = generate_story_image(scene_prompt)
                if image is not None:
                    # Convert PIL Image to numpy array with explicit mode conversion
                    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
                    pil_image = pil_image.convert('RGB')  # Ensure RGB mode
                    img_array = np.array(pil_image)

                    # Verify array shape and type
                    if len(img_array.shape) == 3 and img_array.shape[2] == 3:
                        generated_images.append(img_array)
                        progress_messages.append(f"✅ Successfully completed scene {section_num}")
                    else:
                        progress_messages.append(f"❌ Error: Invalid image format for scene {section_num}")
                else:
                    progress_messages.append(f"❌ Failed to generate scene {section_num}")
                clear_memory()
            except Exception as e:
                error_msg = f"❌ Error generating scene {section_num}: {str(e)}"
                progress_messages.append(error_msg)
                clear_memory()
                continue

            # Yield a progress update after each scene
            yield generated_images, "\n\n".join(formatted_prompts), update_progress()

    # Final status update
    if not generated_images:
        progress_messages.append("❌ No images were successfully generated")
    else:
        progress_messages.append(f"✅ Successfully completed all {len(generated_images)} scenes!")

    # Final yield
    yield generated_images, "\n\n".join(formatted_prompts), update_progress()

def add_text_to_scenes(gallery_images, prompts_text):
    """Add text overlays to all scenes."""
    print(f"Received gallery_images type: {type(gallery_images)}")
    print(f"Number of images in gallery: {len(gallery_images) if isinstance(gallery_images, list) else 0}")

    if not isinstance(gallery_images, list):
        print("Gallery images must be a list")
        return [], []

    clear_memory()

    # Process text sections
    sections = prompts_text.split('='*50)
    overlaid_images = []
    output_files = []

    # Create temporary directory for saving files
    temp_dir = "temp_book_pages"
    os.makedirs(temp_dir, exist_ok=True)

    for i, (img_data, section) in enumerate(zip(gallery_images, sections)):
        if not section.strip():
            continue
        print(f"\nProcessing image {i+1}:")
        print(f"Image data type: {type(img_data)}")
        try:
            # Handle (filepath, caption) tuples from the Gradio gallery
            if isinstance(img_data, tuple):
                filepath = img_data[0] if isinstance(img_data[0], str) else None
                print(f"Found filepath: {filepath}")
                if filepath and os.path.exists(filepath):
                    print(f"Loading image from: {filepath}")
                    image = Image.open(filepath).convert('RGB')
                else:
                    print(f"Invalid filepath: {filepath}")
                    continue
            else:
                print(f"Unexpected image data type: {type(img_data)}")
                continue

            # Extract the paragraph text: it is the line after the "Paragraph N:" header
            lines = [line.strip() for line in section.split('\n') if line.strip()]
            paragraph = None
            for j, line in enumerate(lines):
                if line.startswith('Paragraph'):
                    if j + 1 < len(lines):
                        paragraph = lines[j + 1]
                        print(f"Found paragraph text for image {i+1}")
                    break

            if paragraph and image:
                # Add text overlay
                overlaid_img = overlay_text_on_image(image, paragraph)
                if overlaid_img is not None:
                    # Convert to numpy array for gallery display
                    overlaid_array = np.array(overlaid_img)
                    overlaid_images.append(overlaid_array)

                    # Save file for download
                    output_path = os.path.join(temp_dir, f"panel_{i+1}.png")
                    overlaid_img.save(output_path)
                    output_files.append(output_path)
                    print(f"Successfully processed image {i+1}")
                else:
                    print(f"Failed to overlay text on image {i+1}")
        except Exception as e:
            print(f"Error processing image {i+1}: {str(e)}")
            import traceback
            print(traceback.format_exc())
            continue

    if not overlaid_images:
        print("No images were successfully processed")
    else:
        print(f"Successfully processed {len(overlaid_images)} images")

    clear_memory()
    return overlaid_images, output_files

def overlay_text_on_image(image, text):
    """Add black text with a white outline for better visibility."""
    if image is None:
        return None
    try:
        # Ensure we're working with RGB mode
        img = image.convert('RGB')
        draw = ImageDraw.Draw(img)

        # Scale the font size with the image dimensions
        font_size = int(img.width * 0.025)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
        except OSError:
            print("Using default font as DejaVuSans-Bold.ttf not found")
            font = ImageFont.load_default()

        # Calculate text positioning
        y_position = int(img.height * 0.005)
        x_margin = int(img.width * 0.005)
        available_width = img.width - (2 * x_margin)

        # Wrap text to fit the image width (0.6 * font_size approximates average glyph width)
        wrapped_text = textwrap.fill(text, width=int(available_width / (font_size * 0.6)))

        # Black text with a white outline for readability on any background
        outline_color = (255, 255, 255)
        text_color = (0, 0, 0)
        offsets = [-2, -1, 1, 2]

        # Draw the outline by stamping the text at small offsets
        for dx in offsets:
            for dy in offsets:
                draw.multiline_text(
                    (x_margin + dx, y_position + dy),
                    wrapped_text,
                    font=font,
                    fill=outline_color
                )

        # Draw the main text on top
        draw.multiline_text(
            (x_margin, y_position),
            wrapped_text,
            font=font,
            fill=text_color
        )
        return img
    except Exception as e:
        print(f"Error in overlay_text_on_image: {e}")
        return None

def generate_combined_audio_from_story(story_text, voice='af_heart', speed=1):
    """Generate audio for the story with improved error handling and debugging."""
    clear_memory()
    if not story_text:
        print("No story text provided")
        return None

    print(f"Generating audio for story of length: {len(story_text)}")

    # Clean up text and split into manageable chunks
    paragraphs = [p.strip() for p in story_text.split('\n\n') if p.strip()]
    if not paragraphs:
        print("No valid paragraphs found in story")
        return None

    print(f"Processing {len(paragraphs)} paragraphs")
    combined_audio = []
    try:
        for i, paragraph in enumerate(paragraphs):
            if not paragraph.strip():
                continue
            print(f"Processing paragraph {i+1}/{len(paragraphs)}")
            print(f"Paragraph length: {len(paragraph)}")
            print(f"Paragraph text: {paragraph[:100]}...")  # Print first 100 chars
            try:
                # Generate audio for each sentence separately
                sentences = [s.strip() for s in paragraph.split('.') if s.strip()]
                print(f"Split into {len(sentences)} sentences")
                for j, sentence in enumerate(sentences):
                    print(f"Processing sentence {j+1}/{len(sentences)}")
                    print(f"Sentence length: {len(sentence)}")
                    # Wrap the generator in its own error handling
                    try:
                        generator = pipeline(
                            sentence + '.',  # Add the period back
                            voice=voice,
                            speed=speed,
                            split_pattern=r'\n+'
                        )
                        # Validate the generator before consuming it
                        if generator is None:
                            print(f"Warning: Generator returned None for sentence: {sentence[:50]}...")
                            continue

                        # Process generator output with additional error handling
                        for batch_idx, metadata, audio in generator:
                            print(f"Processing batch {batch_idx}, audio length: {len(audio) if audio is not None else 0}")
                            if audio is not None and len(audio) > 0:
                                # Validate audio data
                                if isinstance(audio, (list, np.ndarray)):
                                    combined_audio.extend(audio)
                                else:
                                    print(f"Warning: Invalid audio type: {type(audio)}")
                            else:
                                print(f"Warning: Empty audio generated for sentence: {sentence[:50]}...")

                        # Add a small pause between sentences (1000 samples, roughly 42 ms at 24 kHz)
                        combined_audio.extend([0] * 1000)
                    except Exception as e:
                        print(f"Error processing sentence {j+1}: {str(e)}")
                        import traceback
                        print(traceback.format_exc())
                        continue

                # Add a longer pause between paragraphs (2000 samples, roughly 83 ms at 24 kHz)
                combined_audio.extend([0] * 2000)
            except Exception as e:
                print(f"Error processing paragraph {i+1}: {str(e)}")
                import traceback
                print(traceback.format_exc())
                continue

        if not combined_audio:
            print("No audio was generated")
            return None

        # Convert combined audio to a NumPy array and normalize
        combined_audio = np.array(combined_audio)
        if len(combined_audio) > 0:
            # Print audio statistics
            print(f"Final audio length: {len(combined_audio)}")
            print(f"Audio min/max values: {np.min(combined_audio)}/{np.max(combined_audio)}")

            # Normalize to a 0.9 peak to prevent clipping
            max_val = np.max(np.abs(combined_audio))
            if max_val > 0:
                combined_audio = combined_audio * 0.9 / max_val
                print("Audio normalized successfully")

            # Save audio with error handling
            try:
                filename = "combined_story.wav"
                sf.write(filename, combined_audio, 24000)
                print(f"Successfully saved audio to {filename}")
                return filename
            except Exception as e:
                print(f"Error saving audio file: {str(e)}")
                return None
        else:
            print("Error: Combined audio array is empty")
            return None
    except Exception as e:
        print(f"Error generating audio: {str(e)}")
        import traceback
        print(traceback.format_exc())
        clear_memory()
        return None
    finally:
        clear_memory()

# Helper functions
def clean_story_output(story):
    """Clean up the generated story text."""
    story = story.replace("<|im_end|>", "")

    # Find where the story proper begins
    story_start = story.find("Once upon")
    if story_start == -1:
        possible_starts = ["One day", "In a", "There was", "Champ"]
        for marker in possible_starts:
            story_start = story.find(marker)
            if story_start != -1:
                break
    if story_start != -1:
        story = story[story_start:]

    # Drop any echoed instruction lines and numbered requirements
    lines = story.split('\n')
    cleaned_lines = []
    for line in lines:
        line = line.strip()
        if line and not any(skip in line.lower() for skip in ['requirement', 'include these values', 'theme:', 'keep it simple', 'end with', 'write a']):
            if not line.startswith(('1.', '2.', '3.', '4.', '5.')):
                cleaned_lines.append(line)
    return '\n\n'.join(cleaned_lines).strip()
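
# Illustrative example (hypothetical strings):
#   clean_story_output("Write a short children's story...\nOnce upon a time, Champ found a kite.<|im_end|>")
#   -> "Once upon a time, Champ found a kite."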

def split_into_paragraphs(text):
    """Split text into paragraphs."""
    paragraphs = []
    current_paragraph = []
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            if current_paragraph:
                paragraphs.append(' '.join(current_paragraph))
                current_paragraph = []
        else:
            current_paragraph.append(line)
    if current_paragraph:
        paragraphs.append(' '.join(current_paragraph))
    return [p for p in paragraphs if not any(skip in p.lower()
            for skip in ['requirement', 'include these values', 'theme:',
                         'keep it simple', 'end with', 'write a'])]
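
# Illustrative example:
#   split_into_paragraphs("Champ ran.\nHe barked.\n\nThe end.")
#   -> ["Champ ran. He barked.", "The end."]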

def process_generated_prompt(prompt, paragraph):
    """Process and clean up generated image prompts."""
    # Strip chat-template tokens and role markers
    prompt = prompt.replace("<|im_start|>", "").replace("<|im_end|>", "")
    prompt = prompt.replace("assistant", "").replace("system", "").replace("user", "")

    # Keep only lines that follow the requested "Watercolor bulldog" format
    cleaned_lines = [line.strip() for line in prompt.split('\n')
                     if line.strip().lower().startswith("watercolor bulldog")]
    if cleaned_lines:
        prompt = cleaned_lines[0]
    else:
        # Fall back to a templated prompt built from keywords in the paragraph
        setting = "quiet town" if "quiet town" in paragraph.lower() else "park"
        mood = "hopeful" if "wished" in paragraph.lower() else "peaceful"
        prompt = f"Watercolor bulldog watching friends play in {setting}, {mood} atmosphere."
    if not prompt.endswith('.'):
        prompt = prompt + '.'
    return prompt
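
# Illustrative example of the fallback path (hypothetical strings):
#   process_generated_prompt("no usable line here",
#                            "Champ wished he could join the game in the quiet town.")
#   -> "Watercolor bulldog watching friends play in quiet town, hopeful atmosphere."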

def create_interface():
    # Define CSS for custom styling
    css = """
    /* Global styles */
    .gradio-container {
        background-color: #EBF8FF !important;
    }

    /* Custom button styling */
    .custom-button {
        background-color: #3B82F6 !important;
        color: white !important;
        border: none !important;
        border-radius: 8px !important;
        padding: 10px 20px !important;
        margin: 10px 0 !important;
        min-width: 200px !important;
    }
    .custom-button:hover {
        background-color: #2563EB !important;
    }

    /* Section styling */
    .section-content {
        background-color: white !important;
        border-radius: 12px !important;
        padding: 20px !important;
        margin: 10px 0 !important;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05) !important;
    }

    /* AI Lesson box styling */
    .ai-lesson {
        background-color: #FEE2E2 !important;
        border-radius: 8px !important;
        padding: 15px !important;
        margin: 10px 0 !important;
        border: 1px solid #FCA5A5 !important;
    }
    """
    with gr.Blocks(css=css) as demo:
        gr.Markdown("""
        # 🎨 Tech Tales: AI Children's Story Creator

        Welcome to this educational AI story creation tool! This app demonstrates how multiple AI models
        work together to create an illustrated children's story. Each step includes a brief AI lesson
        to help you understand the technology being used.

        Let's create something magical! ✨
        """)
        # Step 1: Generate Landscape
        with gr.Row(elem_classes="section-content"):
            with gr.Column(elem_classes="ai-lesson"):
                gr.Markdown("""
                ### Step 1: Setting the Scene with AI 🖼️

                🤖 **AI Lesson: Text-to-Image Generation**

                We're using Stable Diffusion, a powerful AI model that turns text into images.

                How it works:
                - Starts with random noise and gradually refines it into an image
                - Uses millions of image-text pairs from its training
                - Combines understanding of both language and visual elements
                - Takes dozens of denoising steps per image (30-50 in this app)

                Real-world applications: Book illustrations, concept art, product visualization
                """)
            with gr.Column():
                generate_btn = gr.Button("1. Generate Random Landscape", elem_classes="custom-button")
                image_output = gr.Image(label="Your AI-Generated Landscape", type="pil", interactive=False)
        # Step 2: Analyze Scene
        with gr.Row(elem_classes="section-content"):
            with gr.Column(elem_classes="ai-lesson"):
                gr.Markdown("""
                ### Step 2: Teaching AI to See 👁️

                🤖 **AI Lesson: Vision-Language Models (VLM)**

                Our VLM acts like an AI art critic, understanding and describing images.

                How it works:
                - Processes images through neural networks
                - Identifies objects, scenes, colors, and relationships
                - Translates visual features into natural language
                - Uses attention mechanisms to focus on important details

                Real-world applications: Image search, accessibility tools, medical imaging
                """)
            with gr.Column():
                analyze_btn = gr.Button("2. Get Brief Description", elem_classes="custom-button")
                analysis_output = gr.Textbox(label="What the AI Sees", lines=3)
        # Step 3: Create Story
        with gr.Row(elem_classes="section-content"):
            with gr.Column(elem_classes="ai-lesson"):
                gr.Markdown("""
                ### Step 3: Crafting the Narrative 📝

                🤖 **AI Lesson: Large Language Models**

                Meet our AI storyteller! It uses a Large Language Model (LLM) to write creative stories.

                How it works:
                - Processes the scene description as context
                - Uses pattern recognition from millions of stories
                - Maintains narrative consistency and character development
                - Adapts its writing style for children

                Real-world applications: Content creation, creative writing, education
                """)
            with gr.Column():
                story_btn = gr.Button("3. Create Children's Story", elem_classes="custom-button")
                story_output = gr.Textbox(label="Your AI-Generated Story", lines=10)
        # Step 4: Generate Prompts
        with gr.Row(elem_classes="section-content"):
            with gr.Column(elem_classes="ai-lesson"):
                gr.Markdown("""
                ### Step 4: Planning the Illustrations 🎯

                🤖 **AI Lesson: Natural Language Processing**

                The AI breaks down the story into key scenes and creates optimal image prompts.

                How it works:
                - Analyzes story structure and pacing
                - Identifies key narrative moments
                - Generates specialized prompts for each scene
                - Ensures visual consistency across illustrations

                Real-world applications: Content planning, storyboarding, scene composition
                """)
            with gr.Column():
                prompts_btn = gr.Button("4. Generate Scene Prompts", elem_classes="custom-button")
                prompts_output = gr.Textbox(label="Scene Descriptions", lines=20)
        # Step 5: Generate Scenes
        with gr.Row(elem_classes="section-content"):
            with gr.Column(elem_classes="ai-lesson"):
                gr.Markdown("""
                ### Step 5: Bringing Scenes to Life 🎨

                🤖 **AI Lesson: Specialized Image Generation**

                Using a fine-tuned model to create consistent character illustrations.

                How it works:
                - Uses LoRA (Low-Rank Adaptation) for specialized training
                - Maintains consistent character appearance
                - Generates each scene in sequence, streaming progress as it goes
                - Balances creativity with prompt adherence

                Real-world applications: Character design, animation, book illustration
                """)
            with gr.Column():
                generate_scenes_btn = gr.Button("5. Generate Story Scenes", elem_classes="custom-button")
                scene_progress = gr.Textbox(label="Generation Progress", lines=6, interactive=False)
                gallery = gr.Gallery(label="Story Scenes", columns=2, height="auto", interactive=False)
                scene_prompts_display = gr.Textbox(label="Scene Details", lines=8, interactive=False)
        # Step 6: Add Text
        with gr.Row(elem_classes="section-content"):
            with gr.Column(elem_classes="ai-lesson"):
                gr.Markdown("""
                ### Step 6: Creating Book Pages 📖

                🤖 **AI Lesson: Computer Vision & Layout**

                Combining images and text requires sophisticated layout algorithms.

                How it works:
                - Analyzes image composition for text placement
                - Adjusts font size and style for readability
                - Creates visual hierarchy between elements
                - Ensures consistent formatting across pages

                Real-world applications: Desktop publishing, web design, digital books
                """)
            with gr.Column():
                add_text_btn = gr.Button("6. Add Text to Scenes", elem_classes="custom-button")
                final_gallery = gr.Gallery(label="Final Book Pages", columns=2, height="auto", interactive=False)
                download_btn = gr.File(label="Download Your Story Book", file_count="multiple", interactive=False)
        # Step 7: Audio Generation
        with gr.Row(elem_classes="section-content"):
            with gr.Column(elem_classes="ai-lesson"):
                gr.Markdown("""
                ### Step 7: Adding Narration 🎧

                🤖 **AI Lesson: Text-to-Speech Synthesis**

                Converting our story into natural-sounding speech.

                How it works:
                - Uses neural networks for voice synthesis
                - Adds appropriate emotion and emphasis
                - Controls pacing and pronunciation
                - Maintains consistent voice throughout

                Real-world applications: Audiobooks, accessibility tools, virtual assistants
                """)
            with gr.Column():
                tts_btn = gr.Button("7. Read Story Aloud", elem_classes="custom-button")
                audio_output = gr.Audio(label="Story Narration")
        # Event handlers
        generate_btn.click(fn=generate_image, outputs=image_output)
        analyze_btn.click(fn=analyze_image, inputs=[image_output], outputs=analysis_output)
        story_btn.click(fn=generate_story, inputs=[analysis_output], outputs=story_output)
        prompts_btn.click(fn=generate_image_prompts, inputs=[story_output], outputs=prompts_output)
        generate_scenes_btn.click(
            fn=generate_all_scenes,
            inputs=[prompts_output],
            outputs=[gallery, scene_prompts_display, scene_progress]
        )
        add_text_btn.click(
            fn=add_text_to_scenes,
            inputs=[gallery, prompts_output],
            outputs=[final_gallery, download_btn]
        )
        tts_btn.click(fn=generate_combined_audio_from_story, inputs=[story_output], outputs=audio_output)

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()