"""Gradio app that transcribes audio/video files via the ElevenLabs speech-to-text API."""

import json
import os
import tempfile

import gradio as gr
import requests
import spaces
from dotenv import load_dotenv

# Load environment variables (expects ELEVENLABS_API_KEY in a .env file).
load_dotenv()

# Length, in seconds, of each timestamped segment in the formatted transcript.
TIMESTAMP_INTERVAL = 2


def check_gpu_availability():
    """Return a status banner describing the runtime environment."""
    return "🚀 Running on Hugging Face Spaces with GPU acceleration"


def create_transcript_file(transcript: str, original_filename: str = None):
    """Write *transcript* to a temp .txt file and return its path.

    Args:
        transcript: The transcript text to save.
        original_filename: Optional source media filename; its base name is
            embedded in the transcript filename.

    Returns:
        Path of the written file, or None when the transcript is empty or
        the file could not be written.
    """
    if not transcript or not transcript.strip():
        return None
    try:
        if original_filename:
            # Extract the base name without extension.
            base_name = os.path.splitext(os.path.basename(original_filename))[0]
            filename = f"Transcript_{base_name}.txt"
        else:
            filename = "Transcript.txt"
        filepath = os.path.join(tempfile.gettempdir(), filename)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(transcript)
        print(f"Transcript file created: {filepath}")
        return filepath
    except Exception as e:
        # Best-effort: a failed download file must not break transcript display.
        print(f"Error creating transcript file: {e}")
        return None


@spaces.GPU
def transcribe_audio(audio_file):
    """Transcribe an audio/video file using the ElevenLabs API.

    Args:
        audio_file: Filesystem path of the uploaded audio/video file.

    Returns:
        A (display_text, filepath) tuple: the timestamped transcript (or an
        error message) and the path of a downloadable .txt copy (None on
        error).  Always a 2-tuple so both Gradio outputs are populated.
    """
    try:
        api_key = os.getenv("ELEVENLABS_API_KEY")
        if not api_key:
            # BUG FIX: error branches previously returned a single string,
            # starving the second Gradio output component.
            return (
                "❌ Error: No API key found. Please set ELEVENLABS_API_KEY in your .env file.",
                None,
            )
        if not audio_file:
            return "❌ Error: Please upload an audio or video file.", None

        url = "https://api.elevenlabs.io/v1/speech-to-text"
        headers = {"xi-api-key": api_key}

        with open(audio_file, "rb") as audio_data:
            # NOTE(review): language_code "no" pins Norwegian — confirm intended.
            files = {
                "file": (os.path.basename(audio_file), audio_data),
                "model_id": (None, "scribe_v1"),
                "language_code": (None, "no"),
                "tag_audio_events": (None, "true"),
                "timestamps_granularity": (None, "word"),
                "file_format": (None, "other"),
                "webhook_metadata": (None, json.dumps({"start": "0", "end": "2"})),
            }
            # Timeout added: long uploads are fine, but never hang forever.
            response = requests.post(url, headers=headers, files=files, timeout=300)

        if response.status_code != 200:
            return f"❌ API Error: {response.status_code} - {response.text}", None

        result = response.json()

        def _segment(bucket, words_text):
            """Format one TIMESTAMP_INTERVAL-second bucket as '[MM:SS - MM:SS]\\ntext'."""
            lower = bucket * TIMESTAMP_INTERVAL
            upper = (bucket + 1) * TIMESTAMP_INTERVAL
            return (
                f"[{lower // 60:02d}:{lower % 60:02d} - "
                f"{upper // 60:02d}:{upper % 60:02d}]\n{words_text.strip()}\n\n"
            )

        # Group the per-word results into TIMESTAMP_INTERVAL-second buckets.
        timestamped_text = ""
        i = 0
        cur_text = ""
        for word in result.get("words", []):
            # Renamed from `text` to avoid shadowing the top-level transcript.
            word_text, start, end = (
                word.get("text"),
                word.get("start"),
                word.get("end"),
            )
            print(i, ". ", f"<{word_text}>", start, end)
            cur_text += f"{word_text} "
            if start >= (i + 1) * TIMESTAMP_INTERVAL:
                timestamped_text += _segment(i, cur_text)
                i += 1
                cur_text = ""
        # BUG FIX: the original dropped any words after the last interval boundary.
        if cur_text.strip():
            timestamped_text += _segment(i, cur_text)

        display_text = timestamped_text
        # Create transcript file for download.
        filepath = create_transcript_file(display_text, audio_file)
        return display_text, filepath
    except Exception as e:
        return f"❌ Error during transcription: {str(e)}", None


def create_interface():
    """Create and configure the Gradio Blocks interface.

    Note: the @spaces.GPU decorator was removed here and on
    create_transcript_file — neither function performs GPU work, and the
    decorator needlessly requests a ZeroGPU allocation.
    """
    with gr.Blocks(
        title="Audio/Video Transcriber",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 800px !important; margin: 0 auto !important; }
        .main-header { text-align: center; margin-bottom: 2rem; }
        """,
    ) as interface:
        gr.HTML(
            f"""
            <div class="main-header">
                <h1>đŸŽĩ Audio/Video Transcriber</h1>
                <p>Upload audio or video files and transcribe them using ElevenLabs API</p>
                <p>{check_gpu_availability()}</p>
            </div>
            """
        )
        with gr.Row():
            with gr.Column(scale=2):
                # File upload component.
                audio_input = gr.Audio(
                    label="Upload Audio/Video File",
                    type="filepath",
                )
                transcribe_btn = gr.Button(
                    "🎤 Transcribe Audio/Video", variant="primary", size="lg"
                )
            with gr.Column(scale=2):
                # Output area.
                output_text = gr.Textbox(
                    label="Transcription Result",
                    placeholder="Transcribed text will appear here...",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                )
                download_btn = gr.DownloadButton(
                    label="đŸ“Ĩ Download Transcript",
                    variant="secondary",
                    visible=True,
                )

        # Instructions and info.
        with gr.Accordion("â„šī¸ Instructions & Information", open=False):
            gr.HTML(
                """
                <h3>How to use:</h3>
                <ol>
                    <li>Upload an audio or video file (supported formats: MP3, WAV, MP4, MOV, etc.)</li>
                    <li>Click "Transcribe Audio/Video"</li>
                </ol>
                <h3>Transcription Model:</h3>
                <p>This application uses the scribe_v1 model for high-quality transcription.</p>
                <h3>API Key Setup:</h3>
                <ul>
                    <li>Get your API key from ElevenLabs</li>
                    <li>Create a .env file in the project root with: ELEVENLABS_API_KEY=your_key_here</li>
                </ul>
                """
            )

        # Wire the transcribe button to the two output components.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[output_text, download_btn],
        )
        gr.Examples(
            examples=[],
            inputs=[audio_input],
            label="Example Files (upload your own files to test)",
        )
    return interface


# For Hugging Face Spaces, launch the interface directly.
print("🚀 Starting Audio/Video Transcriber...")
print(f"📊 {check_gpu_availability()}")
print("📝 Make sure you have set your ELEVENLABS_API_KEY in the .env file")
print("🌐 Opening Gradio interface...")

interface = create_interface()
interface.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)