# Author: Ilya Emeliyanov
# update: no spaces flag
# commit: 96f3c52
import os
import gradio as gr
import requests
import json
import tempfile
from dotenv import load_dotenv
import spaces
def check_gpu_availability():
    """Return a human-readable status line describing the hosting environment."""
    status_message = "πŸš€ Running on Hugging Face Spaces with GPU acceleration"
    return status_message
# Load environment variables from a local .env file (expects ELEVENLABS_API_KEY).
load_dotenv()
# Width of each transcript timestamp window, in seconds.
TIMESTAMP_INTERVAL = 2
@spaces.GPU
def create_transcript_file(transcript: str, original_filename: str = None):
"""Create a transcript file for download"""
if not transcript or transcript.strip() == "":
return None
try:
if original_filename:
# Extract the base name without extension
base_name = os.path.splitext(os.path.basename(original_filename))[0]
filename = f"Transcript_{base_name}.txt"
else:
filename = "Transcript.txt"
filepath = os.path.join(tempfile.gettempdir(), filename)
with open(filepath, "w", encoding="utf-8") as f:
f.write(transcript)
print(f"Transcript file created: {filepath}")
return filepath
except Exception as e:
print(f"Error creating transcript file: {e}")
return None
def transcribe_audio(audio_file):
    """
    Transcribe an audio/video file with the ElevenLabs speech-to-text API.

    NOTE(review): no @spaces.GPU here — transcription runs remotely over
    HTTP, so requesting a ZeroGPU slot for this function only wastes quota.

    Args:
        audio_file: Filesystem path to the uploaded media file
            (Gradio ``type="filepath"``), or None when nothing was uploaded.

    Returns:
        tuple[str, str | None]: (display text, path to a downloadable
        transcript file or None). Every branch — including the error
        branches — returns a 2-tuple, because this function feeds two
        Gradio outputs; the previous bare-string error returns broke
        the UI update.
    """
    try:
        api_key = os.getenv("ELEVENLABS_API_KEY")
        if not api_key:
            return "❌ Error: No API key found. Please set ELEVENLABS_API_KEY in your .env file.", None
        if not audio_file:
            return "❌ Error: Please upload an audio or video file.", None

        url = "https://api.elevenlabs.io/v1/speech-to-text"
        headers = {"xi-api-key": api_key}

        with open(audio_file, "rb") as audio_data:
            # Multipart form: the (None, value) tuples are plain form
            # fields, not file uploads.
            files = {
                "file": (os.path.basename(audio_file), audio_data),
                "model_id": (None, "scribe_v1"),
                "language_code": (None, "no"),
                "tag_audio_events": (None, "true"),
                "timestamps_granularity": (None, "word"),
                "file_format": (None, "other"),
                "webhook_metadata": (None, json.dumps({"start": "0", "end": "2"})),
            }
            # Timeout so a stuck upload cannot hang the worker forever.
            response = requests.post(url, headers=headers, files=files, timeout=600)

        if response.status_code != 200:
            return f"❌ API Error: {response.status_code} - {response.text}", None

        result = response.json()

        def window_label(index):
            """Format "[MM:SS - MM:SS]" for the index-th TIMESTAMP_INTERVAL window."""
            lower_bound = index * TIMESTAMP_INTERVAL
            upper_bound = (index + 1) * TIMESTAMP_INTERVAL
            return (
                f"[{lower_bound // 60:02d}:{lower_bound % 60:02d}"
                f" - {upper_bound // 60:02d}:{upper_bound % 60:02d}]"
            )

        # Group the per-word timestamps into TIMESTAMP_INTERVAL-second windows.
        timestamped_text = ""
        window = 0
        cur_text = ""
        for word in result.get("words", []):
            cur_text += f"{word.get('text')} "
            if word.get("start") >= (window + 1) * TIMESTAMP_INTERVAL:
                timestamped_text += f"{window_label(window)}\n{cur_text.strip()}\n\n"
                window += 1
                cur_text = ""
        # Flush trailing words that never crossed the next window boundary;
        # the previous implementation silently dropped them.
        if cur_text.strip():
            timestamped_text += f"{window_label(window)}\n{cur_text.strip()}\n\n"

        # Fall back to the plain transcript when no word timing was returned.
        display_text = timestamped_text or result.get("text", "No text found in response")

        # Create transcript file for download.
        filepath = create_transcript_file(display_text, audio_file)
        return display_text, filepath
    except Exception as e:
        return f"❌ Error during transcription: {str(e)}", None
def create_interface():
    """
    Create and configure the Gradio interface.

    NOTE(review): the @spaces.GPU decorator was removed — it would build
    the UI inside a ZeroGPU worker, which is pure CPU work (and the
    interface must exist in the main process for Gradio to serve it).

    Returns:
        gr.Blocks: The assembled, not-yet-launched interface.
    """
    with gr.Blocks(
        title="Audio/Video Transcriber",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 800px !important;
            margin: 0 auto !important;
        }
        .main-header {
            text-align: center;
            margin-bottom: 2rem;
        }
        """,
    ) as interface:
        gr.HTML(
            f"""
            <div class="main-header">
                <h1>🎡 Audio/Video Transcriber</h1>
                <p>Upload audio or video files and transcribe them using ElevenLabs API</p>
                <p style="font-size: 0.9em; color: #666; margin-top: 10px;">{check_gpu_availability()}</p>
            </div>
            """
        )
        with gr.Row():
            with gr.Column(scale=2):
                # File upload component (filepath mode so the handler
                # receives a path on disk, not raw samples).
                audio_input = gr.Audio(
                    label="Upload Audio/Video File",
                    type="filepath",
                )
                # Transcribe button
                transcribe_btn = gr.Button(
                    "🎀 Transcribe Audio/Video", variant="primary", size="lg"
                )
            with gr.Column(scale=2):
                # Output area
                output_text = gr.Textbox(
                    label="Transcription Result",
                    placeholder="Transcribed text will appear here...",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                )
                # Download button (receives the transcript file path
                # returned by transcribe_audio).
                download_btn = gr.DownloadButton(
                    label="πŸ“₯ Download Transcript",
                    variant="secondary",
                    visible=True,
                )
        # Instructions and info
        with gr.Accordion("ℹ️ Instructions & Information", open=False):
            gr.HTML(
                """
                <div style="padding: 1rem;">
                    <h3>How to use:</h3>
                    <ol>
                        <li>Upload an audio or video file (supported formats: MP3, WAV, MP4, MOV, etc.)</li>
                        <li>Click "Transcribe Audio/Video"</li>
                    </ol>
                    <h3>Transcription Model:</h3>
                    <p>This application uses the <strong>scribe_v1</strong> model for high-quality transcription.</p>
                    <h3>API Key Setup:</h3>
                    <p>Get your API key from <a href="https://elevenlabs.io/" target="_blank">ElevenLabs</a></p>
                    <p>Create a .env file in the project root with: <code>ELEVENLABS_API_KEY=your_key_here</code></p>
                </div>
                """
            )
        # Wire the button to the transcriber: one input (file path),
        # two outputs (text box + download button).
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[output_text, download_btn],
        )
        # Example usage (empty placeholder list; users upload their own files)
        gr.Examples(
            examples=[],
            inputs=[audio_input],
            label="Example Files (upload your own files to test)",
        )
    return interface
# For Hugging Face Spaces, launch the interface directly at import time
# (no __main__ guard — Spaces imports this module and expects the app to start).
print("πŸš€ Starting Audio/Video Transcriber...")
print(f"πŸ“Š {check_gpu_availability()}")
print("πŸ“ Make sure you have set your ELEVENLABS_API_KEY in the .env file")
print("🌐 Opening Gradio interface...")
# Build the UI and serve it on all interfaces at the standard Spaces port 7860.
interface = create_interface()
interface.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)