# Spaces: Running
"""
Gradio note-taking app that:
1. Records voice via microphone
2. Transcribes to text with Whisper (openai/whisper-large-v3)
3. Generates a diagram image from the text with FLUX
4. Displays the note and the diagram side-by-side
"""
import os | |
import tempfile | |
import gradio as gr | |
from huggingface_hub import InferenceClient | |
# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
# Access token is taken from the environment (export HF_TOKEN=...); the app
# refuses to start without it.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("Set HF_TOKEN environment variable")

# Single shared inference client used for both transcription and image
# generation; `bill_to` selects the billing account for the requests.
client = InferenceClient(provider="fal-ai", api_key=HF_TOKEN, bill_to="huggingface")
# ------------------------------------------------------------------
# Core helpers
# ------------------------------------------------------------------
def transcribe(audio_path: str) -> str:
    """Return the text recognized in the audio file at *audio_path*.

    The file is sent to the ``openai/whisper-large-v3`` model through the
    module-level inference client, and the ``"text"`` field of the response
    is returned.
    """
    asr_result = client.automatic_speech_recognition(audio_path, model="openai/whisper-large-v3")
    return asr_result["text"]
def generate_diagram(text: str) -> str:
    """Generate a diagram image for *text* with FLUX and return its file path.

    The image is requested from ``black-forest-labs/FLUX.1-schnell`` at
    768x512 and written to a fresh ``.png`` temporary file.

    Args:
        text: The note content the diagram should illustrate.

    Returns:
        Path of the saved PNG. The file is created with ``delete=False``, so
        it is not removed automatically.
    """
    image = client.text_to_image(
        prompt=f"Clean, simple diagram illustrating: {text}",
        model="black-forest-labs/FLUX.1-schnell",
        width=768,
        height=512,
    )
    # Reserve a unique path and close the handle *before* writing: leaving it
    # open leaks a file descriptor per request, and on Windows the path cannot
    # be reopened by name while the temporary file is still open.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        png_path = tmp.name
    image.save(png_path)
    return png_path
# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
def process_voice(audio):
    """Turn a recorded audio file into a note and a diagram.

    Args:
        audio: Path of the recorded audio file, or ``None``/empty when there
            is no recording (e.g. the recording was cleared in the UI).

    Returns:
        A ``(text, image_path)`` tuple: the transcription and the path of the
        generated diagram. ``image_path`` is ``None`` when there is nothing
        to illustrate; with no recording at all the result is ``("", None)``.
    """
    # The change event also fires when the recording is cleared, in which
    # case there is no file to transcribe; reset both outputs instead of
    # sending an invalid request.
    if not audio:
        return "", None

    text = transcribe(audio)
    # A silent recording can yield a blank transcription; skip the image
    # request rather than generating a diagram for an empty prompt.
    if not text or not text.strip():
        return text, None

    img_path = generate_diagram(text)
    return text, img_path
# Page layout: a microphone recorder on top, then the transcription (wider
# column) and the generated diagram side by side.
with gr.Blocks(title="Voice-to-Diagram Note Taker") as demo:
    # Heading emoji restored from mis-decoded bytes to the microphone glyph.
    gr.Markdown("# 🎤 Voice Note & Diagram Generator")
    gr.Markdown("Speak into the microphone; your words become a note and an auto-generated diagram.")

    with gr.Row():
        # type="filepath" hands process_voice a path to the recorded file.
        mic = gr.Audio(sources="microphone", type="filepath", label="Record")

    with gr.Row():
        with gr.Column(scale=2):
            note_text = gr.Textbox(label="Transcription", lines=5, interactive=True)
        with gr.Column(scale=1):
            diagram_img = gr.Image(label="Generated Diagram")

    # Event listeners must be registered inside the Blocks context.
    mic.change(fn=process_voice, inputs=mic, outputs=[note_text, diagram_img])

demo.launch()