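"""Gradio app that transcribes audio/video files with the ElevenLabs speech-to-text API.

The transcript is grouped into short timestamped blocks and offered as a .txt download.
"""
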
import json
import os
import tempfile

import gradio as gr
import requests
from dotenv import load_dotenv

import spaces


def check_gpu_availability():
    """Return a status message for the Hugging Face Spaces GPU environment."""
    return "🚀 Running on Hugging Face Spaces with GPU acceleration"


load_dotenv()

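# Length, in seconds, of each timestamped block in the transcript output.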
TIMESTAMP_INTERVAL = 2


def create_transcript_file(transcript: str, original_filename: str | None = None):
    """Create a transcript file for download."""
    if not transcript or transcript.strip() == "":
        return None

    try:
        if original_filename:
            base_name = os.path.splitext(os.path.basename(original_filename))[0]
            filename = f"Transcript_{base_name}.txt"
        else:
            filename = "Transcript.txt"

        filepath = os.path.join(tempfile.gettempdir(), filename)

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(transcript)

        print(f"Transcript file created: {filepath}")
        return filepath

    except Exception as e:
        print(f"Error creating transcript file: {e}")
        return None


@spaces.GPU
def transcribe_audio(audio_file):
    """
    Transcribe an audio/video file using the ElevenLabs API.

    Args:
        audio_file: Path to the uploaded audio/video file.

    Returns:
        A tuple of (transcript text, path to a downloadable transcript file or None).
    """
    try:
        api_key = os.getenv("ELEVENLABS_API_KEY")
        if not api_key:
            return "❌ Error: No API key found. Please set ELEVENLABS_API_KEY in your .env file.", None

        if not audio_file:
            return "❌ Error: Please upload an audio or video file.", None

        url = "https://api.elevenlabs.io/v1/speech-to-text"
        headers = {"xi-api-key": api_key}

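        # Send the file to the ElevenLabs speech-to-text endpoint as a multipart
        # form; the transcription options are passed as extra form fields.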
        with open(audio_file, "rb") as audio_data:
            files = {
                "file": (os.path.basename(audio_file), audio_data),
                "model_id": (None, "scribe_v1"),
                "language_code": (None, "no"),
                "tag_audio_events": (None, "true"),
                "timestamps_granularity": (None, "word"),
                "file_format": (None, "other"),
                "webhook_metadata": (None, json.dumps({"start": "0", "end": "2"})),
            }

            response = requests.post(url, headers=headers, files=files)

        if response.status_code == 200:
            result = response.json()
            text = result.get("text", "No text found in response")

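            # Group the word-level timestamps into blocks of TIMESTAMP_INTERVAL
            # seconds, each prefixed with a [MM:SS - MM:SS] header.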
timestamped_text = "" |
|
i = 0 |
|
cur_text = "" |
|
for word in result.get("words", []): |
|
text, start, end = ( |
|
word.get("text"), |
|
word.get("start"), |
|
word.get("end"), |
|
) |
|
|
|
print(i, ". ", f"<{text}>", start, end) |
|
|
|
cur_text += f"{text} " |
|
|
|
if start >= (i + 1) * TIMESTAMP_INTERVAL: |
|
lower_bound, upper_bound = ( |
|
i * TIMESTAMP_INTERVAL, |
|
(i + 1) * TIMESTAMP_INTERVAL, |
|
) |
|
start_minutes = lower_bound // 60 |
|
start_seconds = lower_bound % 60 |
|
end_minutes = upper_bound // 60 |
|
end_seconds = upper_bound % 60 |
|
timestamped_text += f"[{start_minutes:02d}:{start_seconds:02d} - {end_minutes:02d}:{end_seconds:02d}]\n{cur_text.strip()}\n\n" |
|
i += 1 |
|
cur_text = "" |
|
|
|
            # Flush any words left over after the last full interval.
            if cur_text.strip():
                timestamped_text += f"{cur_text.strip()}\n"

            # Fall back to the plain transcript if no word timestamps were returned.
            display_text = timestamped_text if timestamped_text else text

            filepath = create_transcript_file(display_text, audio_file)

            return display_text, filepath
        else:
            return f"❌ API Error: {response.status_code} - {response.text}", None

    except Exception as e:
        return f"❌ Error during transcription: {str(e)}", None


def create_interface():
    """Create and configure the Gradio interface."""

    with gr.Blocks(
        title="Audio/Video Transcriber",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 800px !important;
            margin: 0 auto !important;
        }
        .main-header {
            text-align: center;
            margin-bottom: 2rem;
        }
        """,
    ) as interface:

        gr.HTML(
            f"""
            <div class="main-header">
                <h1>🎵 Audio/Video Transcriber</h1>
                <p>Upload audio or video files and transcribe them using the ElevenLabs API</p>
                <p style="font-size: 0.9em; color: #666; margin-top: 10px;">{check_gpu_availability()}</p>
            </div>
            """
        )

        with gr.Row():
            with gr.Column(scale=2):
                audio_input = gr.Audio(
                    label="Upload Audio/Video File",
                    type="filepath",
                )

                transcribe_btn = gr.Button(
                    "🎤 Transcribe Audio/Video", variant="primary", size="lg"
                )

            with gr.Column(scale=2):
                output_text = gr.Textbox(
                    label="Transcription Result",
                    placeholder="Transcribed text will appear here...",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                )

                download_btn = gr.DownloadButton(
                    label="📥 Download Transcript",
                    variant="secondary",
                    visible=True,
                )

        with gr.Accordion("ℹ️ Instructions & Information", open=False):
            gr.HTML(
                """
                <div style="padding: 1rem;">
                    <h3>How to use:</h3>
                    <ol>
                        <li>Upload an audio or video file (supported formats: MP3, WAV, MP4, MOV, etc.)</li>
                        <li>Click "Transcribe Audio/Video"</li>
                    </ol>

                    <h3>Transcription Model:</h3>
                    <p>This application uses the <strong>scribe_v1</strong> model for high-quality transcription.</p>

                    <h3>API Key Setup:</h3>
                    <p>Get your API key from <a href="https://elevenlabs.io/" target="_blank">ElevenLabs</a></p>
                    <p>Create a .env file in the project root with: <code>ELEVENLABS_API_KEY=your_key_here</code></p>
                </div>
                """
            )

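        # Wire up the button: transcribe_audio returns (transcript text, file path),
        # which populate the result textbox and the download button respectively.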
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[output_text, download_btn],
        )

        gr.Examples(
            examples=[],
            inputs=[audio_input],
            label="Example Files (upload your own files to test)",
        )

    return interface


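# Module-level startup: build the interface and launch the Gradio server.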
print("🚀 Starting Audio/Video Transcriber...")
print(f"📊 {check_gpu_availability()}")
print("🔑 Make sure you have set your ELEVENLABS_API_KEY in the .env file")
print("🌐 Opening Gradio interface...")

interface = create_interface()
interface.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)