# Author: Ilya Emeliyanov
# update: no spaces flag
# commit: 96f3c52
import os
import gradio as gr
import requests
import json
import tempfile
from dotenv import load_dotenv
import spaces
def check_gpu_availability():
    """Return a human-readable status line describing the hosting environment."""
    status_message = "πŸš€ Running on Hugging Face Spaces with GPU acceleration"
    return status_message
# Load environment variables from a local .env file (expects ELEVENLABS_API_KEY).
load_dotenv()
# Width of each transcript timestamp window, in seconds.
TIMESTAMP_INTERVAL = 2
@spaces.GPU
def create_transcript_file(transcript: str, original_filename: str = None):
"""Create a transcript file for download"""
if not transcript or transcript.strip() == "":
return None
try:
if original_filename:
# Extract the base name without extension
base_name = os.path.splitext(os.path.basename(original_filename))[0]
filename = f"Transcript_{base_name}.txt"
else:
filename = "Transcript.txt"
filepath = os.path.join(tempfile.gettempdir(), filename)
with open(filepath, "w", encoding="utf-8") as f:
f.write(transcript)
print(f"Transcript file created: {filepath}")
return filepath
except Exception as e:
print(f"Error creating transcript file: {e}")
return None
def transcribe_audio(audio_file):
    """
    Transcribe an audio/video file with the ElevenLabs speech-to-text API.

    NOTE(review): no @spaces.GPU here — transcription runs remotely over
    HTTP, so requesting a ZeroGPU slot for this function only wastes quota.

    Args:
        audio_file: Filesystem path to the uploaded media file
            (Gradio ``type="filepath"``), or None when nothing was uploaded.

    Returns:
        tuple[str, str | None]: (display text, path to a downloadable
        transcript file or None). Every branch — including the error
        branches — returns a 2-tuple, because this function feeds two
        Gradio outputs; the previous bare-string error returns broke
        the UI update.
    """
    try:
        api_key = os.getenv("ELEVENLABS_API_KEY")
        if not api_key:
            return "❌ Error: No API key found. Please set ELEVENLABS_API_KEY in your .env file.", None
        if not audio_file:
            return "❌ Error: Please upload an audio or video file.", None

        url = "https://api.elevenlabs.io/v1/speech-to-text"
        headers = {"xi-api-key": api_key}

        with open(audio_file, "rb") as audio_data:
            # Multipart form: the (None, value) tuples are plain form
            # fields, not file uploads.
            files = {
                "file": (os.path.basename(audio_file), audio_data),
                "model_id": (None, "scribe_v1"),
                "language_code": (None, "no"),
                "tag_audio_events": (None, "true"),
                "timestamps_granularity": (None, "word"),
                "file_format": (None, "other"),
                "webhook_metadata": (None, json.dumps({"start": "0", "end": "2"})),
            }
            # Timeout so a stuck upload cannot hang the worker forever.
            response = requests.post(url, headers=headers, files=files, timeout=600)

        if response.status_code != 200:
            return f"❌ API Error: {response.status_code} - {response.text}", None

        result = response.json()

        def window_label(index):
            """Format "[MM:SS - MM:SS]" for the index-th TIMESTAMP_INTERVAL window."""
            lower_bound = index * TIMESTAMP_INTERVAL
            upper_bound = (index + 1) * TIMESTAMP_INTERVAL
            return (
                f"[{lower_bound // 60:02d}:{lower_bound % 60:02d}"
                f" - {upper_bound // 60:02d}:{upper_bound % 60:02d}]"
            )

        # Group the per-word timestamps into TIMESTAMP_INTERVAL-second windows.
        timestamped_text = ""
        window = 0
        cur_text = ""
        for word in result.get("words", []):
            cur_text += f"{word.get('text')} "
            if word.get("start") >= (window + 1) * TIMESTAMP_INTERVAL:
                timestamped_text += f"{window_label(window)}\n{cur_text.strip()}\n\n"
                window += 1
                cur_text = ""
        # Flush trailing words that never crossed the next window boundary;
        # the previous implementation silently dropped them.
        if cur_text.strip():
            timestamped_text += f"{window_label(window)}\n{cur_text.strip()}\n\n"

        # Fall back to the plain transcript when no word timing was returned.
        display_text = timestamped_text or result.get("text", "No text found in response")

        # Create transcript file for download.
        filepath = create_transcript_file(display_text, audio_file)
        return display_text, filepath
    except Exception as e:
        return f"❌ Error during transcription: {str(e)}", None
def create_interface():
    """
    Create and configure the Gradio interface.

    NOTE(review): the @spaces.GPU decorator was removed — it would build
    the UI inside a ZeroGPU worker, which is pure CPU work (and the
    interface must exist in the main process for Gradio to serve it).

    Returns:
        gr.Blocks: The assembled, not-yet-launched interface.
    """
    with gr.Blocks(
        title="Audio/Video Transcriber",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 800px !important;
            margin: 0 auto !important;
        }
        .main-header {
            text-align: center;
            margin-bottom: 2rem;
        }
        """,
    ) as interface:
        gr.HTML(
            f"""
            <div class="main-header">
                <h1>🎡 Audio/Video Transcriber</h1>
                <p>Upload audio or video files and transcribe them using ElevenLabs API</p>
                <p style="font-size: 0.9em; color: #666; margin-top: 10px;">{check_gpu_availability()}</p>
            </div>
            """
        )
        with gr.Row():
            with gr.Column(scale=2):
                # File upload component (filepath mode so the handler
                # receives a path on disk, not raw samples).
                audio_input = gr.Audio(
                    label="Upload Audio/Video File",
                    type="filepath",
                )
                # Transcribe button
                transcribe_btn = gr.Button(
                    "🎀 Transcribe Audio/Video", variant="primary", size="lg"
                )
            with gr.Column(scale=2):
                # Output area
                output_text = gr.Textbox(
                    label="Transcription Result",
                    placeholder="Transcribed text will appear here...",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                )
                # Download button (receives the transcript file path
                # returned by transcribe_audio).
                download_btn = gr.DownloadButton(
                    label="πŸ“₯ Download Transcript",
                    variant="secondary",
                    visible=True,
                )
        # Instructions and info
        with gr.Accordion("ℹ️ Instructions & Information", open=False):
            gr.HTML(
                """
                <div style="padding: 1rem;">
                    <h3>How to use:</h3>
                    <ol>
                        <li>Upload an audio or video file (supported formats: MP3, WAV, MP4, MOV, etc.)</li>
                        <li>Click "Transcribe Audio/Video"</li>
                    </ol>
                    <h3>Transcription Model:</h3>
                    <p>This application uses the <strong>scribe_v1</strong> model for high-quality transcription.</p>
                    <h3>API Key Setup:</h3>
                    <p>Get your API key from <a href="https://elevenlabs.io/" target="_blank">ElevenLabs</a></p>
                    <p>Create a .env file in the project root with: <code>ELEVENLABS_API_KEY=your_key_here</code></p>
                </div>
                """
            )
        # Wire the button to the transcriber: one input (file path),
        # two outputs (text box + download button).
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[output_text, download_btn],
        )
        # Example usage (empty placeholder list; users upload their own files)
        gr.Examples(
            examples=[],
            inputs=[audio_input],
            label="Example Files (upload your own files to test)",
        )
    return interface
# For Hugging Face Spaces, launch the interface directly at import time
# (no __main__ guard — Spaces imports this module and expects the app to start).
print("πŸš€ Starting Audio/Video Transcriber...")
print(f"πŸ“Š {check_gpu_availability()}")
print("πŸ“ Make sure you have set your ELEVENLABS_API_KEY in the .env file")
print("🌐 Opening Gradio interface...")
# Build the UI and serve it on all interfaces at the standard Spaces port 7860.
interface = create_interface()
interface.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)