import os
import tempfile
import traceback
from contextlib import asynccontextmanager

import librosa
import torch
from dotenv import load_dotenv
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from peft import PeftModel
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

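# Global ASR pipeline; populated once at startup by the lifespan handler below.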
transcriber = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    global transcriber

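    # Redirect the Hugging Face caches to a writable location (assuming the
    # host only allows writes under /tmp, as on many container platforms).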
    cache_dir = "/tmp/hf_cache"
    os.makedirs(cache_dir, exist_ok=True)
    os.environ["HF_HOME"] = cache_dir
    os.environ["TRANSFORMERS_CACHE"] = cache_dir
    os.environ["HF_HUB_CACHE"] = cache_dir

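    # librosa depends on numba, whose JIT compiler also needs a writable cache.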
    numba_cache_dir = os.path.join(cache_dir, "numba_cache")
    os.makedirs(numba_cache_dir, exist_ok=True)
    os.environ["NUMBA_CACHE_DIR"] = numba_cache_dir

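    # Use float16 on GPU when available; fall back to float32 on CPU.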
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if device == "cuda:0" else torch.float32

    load_dotenv(override=True)
    hf_token = os.getenv("HF_TOKEN")
    # Log only whether the token is present; never print the secret itself.
    print("After load_dotenv, HF_TOKEN found:", hf_token is not None)

    if hf_token is None:
        raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")

    BASE_MODEL_PATH = "openai/whisper-large-v3-turbo"
    ADAPTER_AND_PROCESSOR_PATH = "Tiberiw/whisper-large-turbo-lora-finetuned-v3"

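    # Load the processor from the adapter repo (presumably saved there during
    # fine-tuning), the weights from the base model, then attach the LoRA
    # adapter on top via PEFT.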
    processor = WhisperProcessor.from_pretrained(
        ADAPTER_AND_PROCESSOR_PATH,
        token=hf_token,
        cache_dir=cache_dir,
    )
    base_model = WhisperForConditionalGeneration.from_pretrained(
        BASE_MODEL_PATH,
        torch_dtype=torch_dtype,
        cache_dir=cache_dir,
    )
    final_model = PeftModel.from_pretrained(
        base_model,
        ADAPTER_AND_PROCESSOR_PATH,
        token=hf_token,
        cache_dir=cache_dir,
    )
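    # Build the HF ASR pipeline around the adapter-wrapped model and its processor.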
    transcriber = pipeline(
        "automatic-speech-recognition",
        model=final_model,
        torch_dtype=torch_dtype,
        device=device,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
    )
    print("Model loaded successfully!")
    yield


app = FastAPI(lifespan=lifespan)

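# Wide-open CORS for development; a production deployment would likely
# restrict allow_origins to the frontend's domain.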
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


def load_audio(path: str):
    """Decode an audio file to a 16 kHz mono float array, as Whisper expects."""
    try:
        audio_array, _ = librosa.load(path, sr=16000, mono=True)
        return audio_array
    except Exception as e:
        msg = f"Failed to load audio: {str(e)}\n{traceback.format_exc()}"
        # These backend error names indicate an unsupported or corrupt format.
        if any(err in str(e) for err in ["NoBackendError", "SoundFileNotOpen", "Unsupported format", "AudioreadError"]):
            raise HTTPException(status_code=415, detail=msg + "\nSupported formats: (WEBM, WAV, MP3, FLAC)")
        raise HTTPException(status_code=500, detail=msg)


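# Accept an uploaded audio file, convert WEBM to MP3 when needed, and return
# the transcription produced by the fine-tuned Whisper pipeline.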
@app.post("/api/transcription")
async def transcribe_pipeline(file: UploadFile = File(...)):
    if not file.content_type or not file.content_type.startswith("audio/"):
        raise HTTPException(status_code=400, detail="Invalid file content type.")

    print(f"Received file: {file.filename}, Content-Type: {file.content_type}")

    # Track both temp paths so the `finally` block can clean them up.
    original_temp_path = None
    input_for_librosa_path = None

    try:
        # Keep the original extension so the decoder can infer the format.
        file_suffix = ".unknown"
        if file.filename:
            _, ext = os.path.splitext(file.filename)
            if ext:
                file_suffix = ext
        print(f"Saving uploaded file to temporary location with suffix '{file_suffix}'")
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as temp_orig_file:
            content = await file.read()
            temp_orig_file.write(content)
            original_temp_path = temp_orig_file.name

        await file.close()

        if file.content_type.startswith("audio/webm"):
            # Browser recordings often arrive as WEBM, which librosa may not
            # decode directly, so convert to MP3 with pydub first.
            print(f"Conversion needed for '{original_temp_path}' (Content-Type: {file.content_type}) to MP3.")

            # Create the destination file up front; only its path is needed below.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_conv_file:
                input_for_librosa_path = temp_conv_file.name

            try:
                # pydub delegates decoding and encoding to FFmpeg under the hood.
                audio = AudioSegment.from_file(original_temp_path)
                audio.export(input_for_librosa_path, format="mp3")
                print(f"Successfully converted '{original_temp_path}' to MP3: '{input_for_librosa_path}'")
            except Exception as e:
                err_msg = f"Audio conversion failed: {str(e)}\n{traceback.format_exc()}"
                if "ffmpeg" in str(e).lower():
                    err_msg += "\nEnsure FFmpeg is installed and in PATH."
                raise HTTPException(status_code=500, detail=err_msg)
        else:
            input_for_librosa_path = original_temp_path
            # Both variables now point at the same file; clear one so the
            # cleanup in `finally` does not delete it twice.
            original_temp_path = None

        audio_array = load_audio(input_for_librosa_path)
        # `.copy()` presumably guards against the pipeline mutating the array in place.
        result = transcriber(audio_array.copy(), return_timestamps=True)
        return {"transcription": result["text"]}

    except HTTPException:
        # Re-raise HTTP errors as-is so their status codes are preserved.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {str(e)}\n{traceback.format_exc()}")
    finally:
        # Remove any temp files that were created, on success or failure.
        for f in (original_temp_path, input_for_librosa_path):
            if f and os.path.exists(f):
                os.unlink(f)