Tiberiw committed on
Commit
9c92b55
·
1 Parent(s): 363a45c

Add application file

Files changed (3)
  1. Dockerfile +31 -0
  2. app.py +136 -0
  3. requirements.txt +74 -0
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ # Hugging Face Spaces Dockerfile for GPU deployment
+ FROM python:3.10-slim
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsndfile1 \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install torch==2.1.2+cu118 torchaudio==2.1.2+cu118 torchvision==0.16.2+cu118 --index-url https://download.pytorch.org/whl/cu118 && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Expose port (Hugging Face Spaces uses port 7860 by default)
+ EXPOSE 7860
+
+ # Command to run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
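
Note: the image pins CUDA 11.8 wheels for torch/torchaudio/torchvision on top of python:3.10-slim, so whether the GPU is actually usable depends on the Space's hardware tier. A minimal sketch to confirm which build ended up in the container (assumes you can open a shell in the running container; nothing here is part of the committed files):

# sanity check: confirm the +cu118 wheel was installed and whether a GPU is visible
import torch

print("torch:", torch.__version__)             # expected to end in "+cu118"
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))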
app.py ADDED
@@ -0,0 +1,136 @@
+ import os
+ import tempfile
+ import torch
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI, UploadFile, File, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
+ from peft import PeftModel
+ import librosa
+ from pydub import AudioSegment
+ from dotenv import load_dotenv
+
+ transcriber = None
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     global transcriber
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     torch_dtype = torch.float16 if device == "cuda:0" else torch.float32
+     load_dotenv(override=True)  # Load environment variables from .env file
+     print("After load_dotenv, HF_TOKEN:", os.getenv("HF_TOKEN"))
+
+     hf_token = os.getenv("HF_TOKEN")
+
+     # Add a check to ensure the token is provided
+     if hf_token is None:
+         raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")
+
+     BASE_MODEL_PATH = "openai/whisper-base"
+     # BASE_MODEL_PATH = "openai/whisper-large-v3-turbo"
+     ADAPTER_AND_PROCESSOR_PATH = "Tiberiw/whisper-base-lora-finetuned-custom-v1"
+     # ADAPTER_AND_PROCESSOR_PATH = "Tiberiw/whisper-large-turbo-lora-finetuned-v3"
+     processor = WhisperProcessor.from_pretrained(ADAPTER_AND_PROCESSOR_PATH, token=hf_token)
+     base_model = WhisperForConditionalGeneration.from_pretrained(BASE_MODEL_PATH, torch_dtype=torch_dtype)
+     final_model = PeftModel.from_pretrained(base_model, ADAPTER_AND_PROCESSOR_PATH, token=hf_token)
+     transcriber = pipeline(
+         "automatic-speech-recognition",
+         model=final_model,
+         torch_dtype=torch_dtype,
+         device=device,
+         tokenizer=processor.tokenizer,
+         feature_extractor=processor.feature_extractor,
+     )
+     print("Model loaded successfully!")
+     yield
+
+ app = FastAPI(lifespan=lifespan)
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ def load_audio(path: str):
+     try:
+         audio_array, _ = librosa.load(path, sr=16000, mono=True)
+         return audio_array
+     except Exception as e:
+         import traceback
+         msg = f"Error processing audio (failed to load): {str(e)}\n{traceback.format_exc()}"
+         if any(err in str(e) for err in ["NoBackendError", "SoundFileNotOpen", "Unsupported format", "AudioreadError"]):
+             raise HTTPException(status_code=415, detail=msg + "\nSupported formats: (WEBM, WAV, MP3, FLAC)")
+         raise HTTPException(status_code=500, detail=msg)
+
+
+
+
+ @app.post("/api/transcription")
+ async def transcribe_pipeline(file: UploadFile = File(...)):
+     if not file.content_type or not file.content_type.startswith("audio/"):
+         raise HTTPException(status_code=400, detail="Invalid file content type.")
+
+     print(f"Received file: {file.filename}, Content-Type: {file.content_type}")
+
+     original_temp_path = None  # Path to the originally uploaded file
+     input_for_librosa_path = None  # Path to the file librosa will load (either original or converted)
+
+
+     try:
+         # 1. Save the uploaded file to a temporary location first.
+         # This gives us a file path to work with, which is often easier for external tools like FFmpeg via pydub.
+         file_suffix = ".unknown"
+         if file.filename:
+             _, ext = os.path.splitext(file.filename)
+             if ext:
+                 file_suffix = ext
+         print(f"Saving uploaded file to temporary location with suffix '{file_suffix}'")
+         with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as temp_orig_file:
+             content = await file.read()
+             temp_orig_file.write(content)
+             original_temp_path = temp_orig_file.name
+
+         # It's good practice to close the UploadFile object after reading its content
+         await file.close()
+
+         if file.content_type.startswith("audio/webm"):
+             print(f"Conversion needed for '{original_temp_path}' (ContentType: {file.content_type}) to MP3.")
+             # Define a path for the converted MP3 file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_conv_file:
+                 input_for_librosa_path = temp_conv_file.name
+
+             try:
+                 # Load the WebM audio from the original temporary file using pydub
+                 # pydub's from_file can often infer the format, or you can specify format="webm" or format="opus"
+                 audio = AudioSegment.from_file(original_temp_path)  # pydub will use FFmpeg here
+
+                 # Export as MP3 to the new temporary file path
+                 audio.export(input_for_librosa_path, format="mp3")
+                 print(f"Successfully converted '{original_temp_path}' to MP3: '{input_for_librosa_path}'")
+             except Exception as e:
+                 import traceback
+                 err_msg = f"Audio conversion failed: {str(e)}\n{traceback.format_exc()}"
+                 if "ffmpeg" in str(e).lower():
+                     err_msg += "\nEnsure FFmpeg is installed and in PATH."
+                 raise HTTPException(status_code=500, detail=err_msg)
+         else:
+             input_for_librosa_path = original_temp_path
+             original_temp_path = None
+
+
+         audio_array = load_audio(input_for_librosa_path)
+         result = transcriber(audio_array.copy(), return_timestamps=True)
+         return {"transcription": result["text"]}
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         import traceback
+         raise HTTPException(status_code=500, detail=f"Unexpected error: {str(e)}\n{traceback.format_exc()}")
+     finally:
+         for f in (original_temp_path, input_for_librosa_path):
+             if f and os.path.exists(f):
+                 os.unlink(f)
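
Note: for reference, a minimal client sketch for the endpoint above, assuming the server is reachable at http://localhost:7860 and a local sample.wav exists (both are placeholder assumptions, not part of the commit); it uses the requests package, which is already pinned in requirements.txt:

# post an audio file to the /api/transcription endpoint (URL and filename are assumptions)
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/api/transcription",
        files={"file": ("sample.wav", f, "audio/wav")},  # content type must start with "audio/"
    )
resp.raise_for_status()
print(resp.json()["transcription"])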
requirements.txt ADDED
@@ -0,0 +1,74 @@
+ accelerate==1.7.0
+ annotated-types==0.7.0
+ anyio==4.9.0
+ audioread==3.0.1
+ certifi==2025.6.15
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ click==8.2.1
+ colorama==0.4.6
+ decorator==5.2.1
+ dnspython==2.7.0
+ email_validator==2.2.0
+ exceptiongroup==1.3.0
+ fastapi==0.115.12
+ fastapi-cli==0.0.7
+ filelock==3.18.0
+ fsspec==2025.5.1
+ h11==0.16.0
+ httpcore==1.0.9
+ httptools==0.6.4
+ httpx==0.28.1
+ huggingface-hub==0.33.0
+ idna==3.10
+ Jinja2==3.1.6
+ joblib==1.5.1
+ lazy_loader==0.4
+ librosa==0.11.0
+ llvmlite==0.44.0
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ mdurl==0.1.2
+ mpmath==1.3.0
+ msgpack==1.1.1
+ networkx==3.4.2
+ numba==0.61.2
+ numpy<2
+ packaging==25.0
+ peft==0.15.2
+ pillow==11.0.0
+ platformdirs==4.3.8
+ pooch==1.8.2
+ psutil==7.0.0
+ pycparser==2.22
+ pydantic==2.11.7
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ python-dotenv==1.1.0
+ python-multipart==0.0.20
+ PyYAML==6.0.2
+ regex==2024.11.6
+ requests==2.32.4
+ rich==14.0.0
+ rich-toolkit==0.14.7
+ safetensors==0.5.3
+ scikit-learn==1.7.0
+ scipy==1.15.3
+ shellingham==1.5.4
+ sniffio==1.3.1
+ soundfile==0.13.1
+ soxr==0.5.0.post1
+ starlette==0.46.2
+ sympy==1.14.0
+ threadpoolctl==3.6.0
+ tokenizers==0.21.1
+ tqdm==4.67.1
+ transformers==4.52.4
+ typer==0.16.0
+ typing-inspection==0.4.1
+ typing_extensions==4.14.0
+ urllib3==2.4.0
+ uvicorn==0.34.3
+ watchfiles==1.0.5
+ websockets==15.0.1