tianyaogavin committed · Commit c967100 · 1 Parent(s): 070daf0

init submit

Files changed (6):
  1. .gitignore +1 -0
  2. Dockerfile +10 -0
  3. README.md +15 -11
  4. app.py +38 -0
  5. requirement.txt +4 -0
  6. vad_realtime_transcribe.py +99 -0
.gitignore ADDED
@@ -0,0 +1 @@
+*.wav
Dockerfile ADDED
@@ -0,0 +1,10 @@
+FROM python:3.10-slim
+
+WORKDIR /code
+
+COPY requirement.txt .
+RUN pip install --no-cache-dir -r requirement.txt
+
+COPY app.py .
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,15 @@
----
-title: Faster Whisper Small
-emoji: 📉
-colorFrom: pink
-colorTo: blue
-sdk: docker
-pinned: false
-license: mit
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Whisper Transcription API on Hugging Face Spaces
+
+This Space provides a simple REST API to transcribe audio using faster-whisper.
+
+### Endpoints
+
+- `GET /` → health check
+- `POST /transcribe` → transcribe an audio file (wav/mp3/etc.)
+
+### Example Usage (curl)
+
+```bash
+curl -X POST https://your-space-name.hf.space/transcribe \
+  -F "file=@audio.wav"
+```
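For callers working from Python, a minimal client sketch is shown below (assuming the `requests` package; the URL is the same placeholder as in the curl example, and `audio.wav` is a hypothetical local file):

```python
import requests

# Placeholder Space URL, as in the curl example above
URL = "https://your-space-name.hf.space/transcribe"

# The endpoint expects a multipart form field named "file"
with open("audio.wav", "rb") as f:
    resp = requests.post(URL, files={"file": f})

resp.raise_for_status()
data = resp.json()
print("language:", data["language"])
for seg in data["segments"]:
    print(f'[{seg["start"]:.2f}s -> {seg["end"]:.2f}s] {seg["text"]}')
```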
app.py ADDED
@@ -0,0 +1,38 @@
+from faster_whisper import WhisperModel
+from fastapi import FastAPI, UploadFile, File
+import uvicorn
+
+app = FastAPI()
+
+# Load the model once at startup (small model on CPU, int8 quantization)
+model = WhisperModel("small", device="cpu", compute_type="int8")
+
+@app.get("/")
+def root():
+    return {"message": "Whisper API is running."}
+
+@app.post("/transcribe")
+async def transcribe(file: UploadFile = File(...)):
+    # Persist the upload so faster-whisper can read it from disk
+    temp_path = f"/tmp/{file.filename}"
+    with open(temp_path, "wb") as f:
+        f.write(await file.read())
+
+    # beam_size=1 keeps CPU latency low; language is pinned to Chinese
+    segments, info = model.transcribe(temp_path, beam_size=1, language="zh")
+
+    results = []
+    for segment in segments:
+        results.append({
+            "start": segment.start,
+            "end": segment.end,
+            "text": segment.text
+        })
+
+    return {
+        "language": info.language,
+        "segments": results
+    }
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
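A quick way to sanity-check these endpoints without deploying is FastAPI's `TestClient` (a sketch; assumes `httpx` is installed, which `TestClient` depends on, and that a short `sample.wav` exists locally):

```python
from fastapi.testclient import TestClient

from app import app  # the FastAPI instance defined above

client = TestClient(app)

# Health check
assert client.get("/").json() == {"message": "Whisper API is running."}

# Transcription round-trip with a hypothetical local sample file
with open("sample.wav", "rb") as f:
    resp = client.post("/transcribe", files={"file": ("sample.wav", f, "audio/wav")})

assert resp.status_code == 200
print(resp.json()["segments"])
```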
requirement.txt ADDED
@@ -0,0 +1,4 @@
+faster-whisper
+fastapi
+uvicorn
+python-multipart
vad_realtime_transcribe.py ADDED
@@ -0,0 +1,99 @@
+import sounddevice as sd
+import webrtcvad
+import numpy as np
+from scipy.io.wavfile import write
+from faster_whisper import WhisperModel
+import time
+
+SAMPLE_RATE = 16000
+FRAME_DURATION = 30  # ms
+FRAME_SIZE = int(SAMPLE_RATE * FRAME_DURATION / 1000)
+SILENCE_THRESHOLD = 0.5  # seconds of continuous silence that ends a recording
+MAX_RECORD_SECONDS = 15  # hard safety cap on recording length
+MIN_SPEECH_DURATION = 0.3  # ignore segments shorter than this
+
+# Initialize the Whisper model (loaded only once)
+print("📥 Loading Whisper model...")
+model = WhisperModel("small", device="cpu", compute_type="int8")
+
+
+def record_and_detect(filename="audio.wav"):
+    vad = webrtcvad.Vad(2)
+    frames = []
+    silence_counter = 0
+    speech_detected = False
+    max_silence_frames = int(SILENCE_THRESHOLD * 1000 / FRAME_DURATION)
+
+    stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype='int16', blocksize=FRAME_SIZE)
+    stream.start()
+    print("🎙️ Start speaking (recording ends automatically after a pause)...")
+
+    try:
+        while True:
+            frame, _ = stream.read(FRAME_SIZE)
+            pcm = frame.flatten()
+            pcm_bytes = pcm.tobytes()
+            is_speech = vad.is_speech(pcm_bytes, SAMPLE_RATE)
+
+            frames.append((pcm.copy(), is_speech))
+
+            if is_speech:
+                silence_counter = 0
+                speech_detected = True
+            else:
+                silence_counter += 1
+
+            if speech_detected and silence_counter >= max_silence_frames:
+                print("🛑 Pause detected, stopping recording")
+                break
+
+            # Enforce the safety cap so a noisy input cannot record forever
+            if len(frames) * FRAME_DURATION / 1000.0 >= MAX_RECORD_SECONDS:
+                print("🛑 Max recording length reached, stopping")
+                break
+    finally:
+        stream.stop()
+        stream.close()
+
+    # Trim trailing silence frames
+    cut_index = len(frames)
+    for i in range(len(frames) - 1, -1, -1):
+        if frames[i][1]:  # frame contains speech
+            cut_index = i + 1
+            break
+
+    trimmed_audio = np.concatenate([frames[i][0] for i in range(cut_index)])
+    duration = len(trimmed_audio) / SAMPLE_RATE
+
+    if duration < MIN_SPEECH_DURATION:
+        print("⚠️ Ignoring too-short recording")
+        return None
+
+    write(filename, SAMPLE_RATE, trimmed_audio.astype(np.int16))
+    print(f"💾 Saved audio: {filename} (length: {duration:.2f}s)")
+    return filename
+
+
+def transcribe(filename):
+    print("🔍 Starting transcription...")
+    t1 = time.time()
+    segments, info = model.transcribe(filename, beam_size=3)
+    t2 = time.time()
+
+    print(f"✅ Detected language: {info.language}")
+    segment_list = list(segments)
+    if not segment_list:
+        print("⚠️ No speech recognized")
+    else:
+        print("📄 Transcript:")
+        for seg in segment_list:
+            print(f"[{seg.start:.2f}s → {seg.end:.2f}s] {seg.text}")
+    print(f"⏱️ Transcription took {t2 - t1:.2f}s")
+
+
+if __name__ == "__main__":
+    while True:
+        audio_file = record_and_detect()
+        if audio_file:
+            transcribe(audio_file)
+        print("\n✅ Waiting for next speech input (Ctrl+C to quit)...\n")