Spaces:

tianyaogavin
/

faster-whisper-small

Running

App Files Files Community

tianyaogavin committed on Mar 14

Commit

c967100

1 Parent(s): 070daf0

init submit

Browse files

Files changed (6) hide show

.gitignore +1 -0
Dockerfile +10 -0
README.md +14 -11
app.py +37 -0
requirement.txt +3 -0
vad_realtime_transcribe.py +95 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.wav

Dockerfile ADDED Viewed

	@@ -0,0 +1,10 @@

+# Container image for the Whisper transcription API (served by uvicorn on port 7860).
+FROM python:3.10-slim
+WORKDIR /code
+# The dependency list in this repository is named requirement.txt (singular).
+COPY requirement.txt .
+RUN pip install --no-cache-dir -r requirement.txt
+# app.py lives at the repository root, so it is importable as the module "app".
+COPY app.py .
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,11 +1,14 @@
----
-title: Faster Whisper Small
-emoji: 📉
-colorFrom: pink
-colorTo: blue
-sdk: docker
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Whisper Transcription API on Hugging Face Spaces
+This Space provides a simple REST API to transcribe audio using faster-whisper.
+### Endpoints
+- `GET /` → health check
+- `POST /transcribe` → Transcribe audio file (wav/mp3/etc.)
+### Example Usage (curl)
+```bash
+curl -X POST https://your-space-name.hf.space/transcribe \
+  -F "file=@audio.wav"
+```

app.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from faster_whisper import WhisperModel
+from fastapi import FastAPI, UploadFile, File
+import uvicorn
+import os
+# FastAPI application instance; the route decorators below register handlers on it.
+app = FastAPI()
+# Load the model once at import time (small model + CPU, int8 compute type).
+model = WhisperModel("small", device="cpu", compute_type="int8")
+@app.get("/")
+def root():
+    return {"message": "Whisper API is running."}
+@app.post("/transcribe")
+async def transcribe(file: UploadFile = File(...)):
+    temp_path = f"/tmp/{file.filename}"
+    with open(temp_path, "wb") as f:
+        f.write(await file.read())
+    segments, info = model.transcribe(temp_path, beam_size=1, language="zh")
+    results = []
+    for segment in segments:
+        results.append({
+            "start": segment.start,
+            "end": segment.end,
+            "text": segment.text
+        })
+    return {
+        "language": info.language,
+        "segments": results
+    }
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)

requirement.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+faster-whisper
+fastapi
+uvicorn
+# Needed by FastAPI to parse multipart/form-data uploads (UploadFile / File).
+python-multipart

vad_realtime_transcribe.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import sounddevice as sd
+import webrtcvad
+import numpy as np
+from scipy.io.wavfile import write
+from faster_whisper import WhisperModel
+import time
+import os
+SAMPLE_RATE = 16000  # sample rate passed to the input stream, the VAD and the WAV writer
+FRAME_DURATION = 30  # ms
+FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION / 1000)  # samples per frame
+SILENCE_THRESHOLD = 0.5  # seconds of continuous silence that count as the end of speech
+MAX_RECORD_SECONDS = 15  # maximum safe recording length, in seconds
+MIN_SPEECH_DURATION = 0.3  # recordings shorter than this (seconds) are ignored as invalid
+# Initialise the Whisper model (loaded only once, at import time).
+print("📥 Loading Whisper model...")
+model = WhisperModel("small", device="cpu", compute_type="int8")
+def record_and_detect(filename="audio.wav"):
+    vad = webrtcvad.Vad(2)
+    frames = []
+    silence_counter = 0
+    speech_detected = False
+    max_silence_frames = int(SILENCE_THRESHOLD * 1000 / FRAME_DURATION)
+    stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype='int16', blocksize=FRAME_SIZE)
+    stream.start()
+    print("🎙️ 说话开始（说完停顿自动结束）...")
+    try:
+        while True:
+            frame, _ = stream.read(FRAME_SIZE)
+            pcm = frame.flatten()
+            pcm_bytes = pcm.tobytes()
+            is_speech = vad.is_speech(pcm_bytes, SAMPLE_RATE)
+            frames.append((pcm.copy(), is_speech))
+            if is_speech:
+                silence_counter = 0
+                speech_detected = True
+            else:
+                silence_counter += 1
+            if speech_detected and silence_counter >= max_silence_frames:
+                print("🛑 停顿检测完成，结束录音")
+                break
+    finally:
+        stream.stop()
+        stream.close()
+    # ✅ 剪掉尾部静音帧
+    cut_index = len(frames)
+    for i in range(len(frames) - 1, -1, -1):
+        if frames[i][1]:  # 是语音
+            cut_index = i + 1
+            break
+    trimmed_audio = np.concatenate([frames[i][0] for i in range(cut_index)])
+    duration = len(trimmed_audio) / SAMPLE_RATE
+    if duration < MIN_SPEECH_DURATION:
+        print("⚠️ 忽略无效短录音")
+        return None
+    write(filename, SAMPLE_RATE, trimmed_audio.astype(np.int16))
+    print(f"💾 已保存音频：{filename} (长度: {duration:.2f}s)")
+    return filename
+def transcribe(filename):
+    print("🔍 开始转录...")
+    t1 = time.time()
+    segments, info = model.transcribe(filename, beam_size=3)
+    t2 = time.time()
+    print(f"✅ 检测语言: {info.language}")
+    segment_list = list(segments)
+    if not segment_list:
+        print("⚠️ 没识别到语音内容")
+    else:
+        print("📄 识别内容：")
+        for seg in segment_list:
+            print(f"[{seg.start:.2f}s → {seg.end:.2f}s] {seg.text}")
+    print(f"⏱️ 转录耗时：{t2 - t1:.2f}s")
+if __name__ == "__main__":
+    while True:
+        audio_file = record_and_detect()
+        if audio_file:
+            transcribe(audio_file)
+        print("\n✅ 等待下一轮语音输入（Ctrl+C退出）...\n")