Upload README.md with huggingface_hub

README.md CHANGED
@@ -1,246 +1,145 @@
Removed (the old README.md held the tail of a FastAPI transcription server script; the first 126 removed lines were not captured on this page):
```python
    try:
        waveform, sample_rate = sf.read(
            str(audio_path),
            dtype="float32",
            always_2d=True
        )
    except Exception as e:
        logging.error(f"Error reading audio file {audio_path}: {e}")
        raise HTTPException(status_code=400, detail=f"Could not read audio file: {e}")

    if sample_rate != 16000:
        # Basic resampling could be added here if needed, or just raise an error
        logging.warning(f"Audio sample rate is {sample_rate}Hz, expected 16000Hz. Results may be suboptimal.")
        # For now, we proceed but log a warning. For critical applications, convert or reject.

    logging.info(f"Processing audio: {audio_path}, Duration: {len(waveform) / sample_rate:.2f}s, Channels: {waveform.shape[1]}")

    lang_code = languages.get(request.language.lower())
    if lang_code is None:
        logging.warning(f"Unsupported language: {request.language}. Defaulting to 'en'. Supported: {list(languages.keys())}")
        lang_code = languages.get("en", 0)  # Fallback to 'en' or 'auto' if 'en' isn't in languages

    all_segments_text: List[str] = []
    detailed_segments: List[Segment] = []
    processing_start_time = time.time()

    for channel_id in range(waveform.shape[1]):
        channel_data = waveform[:, channel_id]
        logging.info(f"Processing channel {channel_id + 1}/{waveform.shape[1]}")

        try:
            # Ensure channel_data is 1D for VAD if it expects that
            speech_segments = vad_model.segments_offline(channel_data)  # segments_offline expects a 1D array
        except Exception as e:
            logging.error(f"VAD processing failed for channel {channel_id}: {e}")
            # Optionally skip this channel or raise an error for the whole request
            continue  # Skip to next channel

        for part_idx, part in enumerate(speech_segments):
            start_sample = int(part[0] * 16)  # VAD returns ms; convert to samples (16 samples/ms at 16kHz)
            end_sample = int(part[1] * 16)
            segment_audio = channel_data[start_sample:end_sample]

            if len(segment_audio) == 0:
                logging.info(f"Empty audio segment for channel {channel_id}, part {part_idx}. Skipping.")
                continue

            try:
                # get_features expects a 1D array
                audio_feats = w_frontend.get_features(segment_audio)
                # The ASR model expects a batch dimension, hence [None, ...]
                asr_result_text_raw = asr_model(
                    audio_feats[None, ...],
                    language=lang_code,
                    use_itn=request.use_itn,
                )
                # Remove tags like <|en|>, <|HAPPY|>, etc.
                asr_result_text_cleaned = re.sub(r"<\|[^\|]+\|>", "", asr_result_text_raw).strip()

                segment_start_s = part[0] / 1000.0
                segment_end_s = part[1] / 1000.0
                logging.info(f"[Ch{channel_id}] [{segment_start_s:.2f}s - {segment_end_s:.2f}s] Raw: {asr_result_text_raw} Cleaned: {asr_result_text_cleaned}")
                all_segments_text.append(asr_result_text_cleaned)
                detailed_segments.append(Segment(start_time_s=segment_start_s, end_time_s=segment_end_s, text=asr_result_text_cleaned))
            except Exception as e:
                logging.error(f"ASR processing failed for segment {part_idx} in channel {channel_id}: {e}")
                # Optionally add a placeholder or skip this segment's text
                detailed_segments.append(Segment(start_time_s=part[0] / 1000.0, end_time_s=part[1] / 1000.0, text="[ASR_ERROR]"))

        vad_model.vad.all_reset_detection()  # Reset VAD state for next channel or call

    full_transcription = " ".join(all_segments_text).strip()
    logging.info(f"Transcription complete in {time.time() - processing_start_time:.2f}s. Result: {full_transcription}")

    return full_transcription


if __name__ == "__main__":
    import uvicorn

    MINIMAL_LOGGING_CONFIG = {
        "version": 1,
        "disable_existing_loggers": False,  # Let other loggers (like our app logger) exist
        "formatters": {
            "default": {
                "()": "uvicorn.logging.DefaultFormatter",
                "fmt": "%(levelprefix)s %(message)s",
                "use_colors": None,
            },
        },
        # The handler and logger entries below were only partially captured in the diff;
        # missing keys are reconstructed from the standard Uvicorn logging config.
        "handlers": {
            "default": {
                "formatter": "default",
                "class": "logging.StreamHandler",
                "stream": "ext://sys.stderr",
            },
        },
        "loggers": {
            "uvicorn": {
                "handlers": ["default"],
                "level": logging.INFO,
                "propagate": False,
            },
            "uvicorn.error": {  # Logs for errors within Uvicorn
                "handlers": ["default"],
                "level": logging.INFO,  # Explicitly use integer
                "propagate": False,
            },
            # We are deliberately not configuring uvicorn.access here for simplicity
            # It might default to INFO or be silent if not configured and no parent handler catches it.
        },
        "root": {  # (reconstructed; this block was only partially captured)
            "handlers": ["default"],
            "level": logging.INFO,
            "propagate": False,
        },
    }

    logger.info("Attempting to run Uvicorn with minimal explicit log_config.")
    uvicorn.run(app, host="0.0.0.0", port=8000, log_config=MINIMAL_LOGGING_CONFIG)
```
Added (new README.md):

---
license: agpl-3.0
language:
- en
- zh
- ja
- ko
base_model: lovemefan/SenseVoice-onnx
tags:
- rknn
---

# SenseVoiceSmall-RKNN2

SenseVoice is an audio foundation model with audio understanding capabilities, including Automatic Speech Recognition (ASR), Language Identification (LID), Speech Emotion Recognition (SER), and Acoustic Event Classification (AEC) or Acoustic Event Detection (AED).

Currently, SenseVoice-small supports multilingual speech recognition, emotion recognition, and event detection for Chinese, Cantonese, English, Japanese, and Korean, with extremely low inference latency.

- Inference speed (RKNN2): about 20x real-time on a single NPU core of the RK3588 (it processes 20 seconds of audio per second), roughly 6 times faster than the official Whisper model provided in the rknn-model-zoo.
- Memory usage (RKNN2): about 1.1 GB

## Usage

1. Clone the project to your local machine.

2. Install the dependencies:

```bash
pip install kaldi_native_fbank onnxruntime sentencepiece soundfile pyyaml "numpy<2"

pip install rknn_toolkit_lite2-2.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
```

The .whl file can be downloaded from the [rknn-toolkit2 repository](https://github.com/airockchip/rknn-toolkit2/blob/master/rknn-toolkit-lite2/packages/rknn_toolkit_lite2-2.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl).

3. Copy `librknnrt.so` to `/usr/lib/`.

Source of `librknnrt.so`: https://github.com/airockchip/rknn-toolkit2/blob/master/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so

4. Run:

```bash
python ./sensevoice_rknn.py --audio_file english.wav
```

If recognition does not work correctly with your own audio files, you may need to convert them to 16 kHz, 16-bit, mono WAV format in advance. For example, with ffmpeg:

```bash
ffmpeg -i input.mp3 -f wav -acodec pcm_s16le -ac 1 -ar 16000 output.wav
```
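
You can verify the result with `soundfile`, which is already installed as a dependency. This is just a quick sanity check; `output.wav` refers to the file produced by the ffmpeg command above:

```python
import soundfile as sf

# SenseVoice expects 16 kHz, mono, 16-bit PCM input.
info = sf.info("output.wav")
print(info.samplerate, info.channels, info.subtype)  # expected: 16000 1 PCM_16
```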

## RKNN Model Conversion

You need to install rknn-toolkit2 v2.1.0 or higher in advance.

1. Download or convert the ONNX model.

You can download the ONNX model from https://huggingface.co/lovemefan/SenseVoice-onnx. It should also be possible to convert a PyTorch model to ONNX by following the documentation at https://github.com/FunAudioLLM/SenseVoice.

The model file should be named `sense-voice-encoder.onnx` and placed in the same directory as the conversion script.

2. Convert to an RKNN model:

```bash
python convert_rknn.py
```
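
For reference, a minimal rknn-toolkit2 conversion flow looks roughly like the sketch below. The target platform and output filename are assumptions here; the actual `convert_rknn.py` in this repo may set additional options (input shapes, optimization level, etc.):

```python
from rknn.api import RKNN

rknn = RKNN(verbose=True)

# fp16 inference on the RK3588 NPU; no quantization dataset needed.
rknn.config(target_platform="rk3588")
rknn.load_onnx(model="sense-voice-encoder.onnx")
rknn.build(do_quantization=False)
rknn.export_rknn("sense-voice-encoder.rknn")  # output name is an assumption
rknn.release()
```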

## Known Issues

- When using fp16 inference with RKNN2, overflow may occur, producing `inf` values. You can try scaling down the input data to work around this: set `SPEECH_SCALE` to a smaller value in `sensevoice_rknn.py`.
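
For example (illustrative only; the default value and exact location of the constant in `sensevoice_rknn.py` may differ):

```python
# sensevoice_rknn.py: scale the waveform down before feature extraction
# so intermediate fp16 activations stay in range (value shown is illustrative).
SPEECH_SCALE = 0.5
```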

## References

- [FunAudioLLM/SenseVoiceSmall](https://huggingface.co/FunAudioLLM/SenseVoiceSmall)
- [lovemefan/SenseVoice-python](https://github.com/lovemefan/SenseVoice-python)

## FastAPI Transcription Server

This project includes a FastAPI server (`server.py`) that provides an HTTP endpoint for speech-to-text transcription.

### Running the Server

1. Ensure all dependencies for `sensevoice_rknn.py` and the server are installed. This includes `fastapi` and `uvicorn`:

```bash
pip install fastapi uvicorn
```

2. Place the required model files (`*.rknn`, `*.onnx`, `spm.model`) in the same directory as `server.py`.

3. Run the server:

```bash
python server.py
```

The server will start on `http://0.0.0.0:8000` by default.

### API Endpoint: `/transcribe`

* **Method:** `POST`
* **Description:** Transcribes the audio file specified in the request.
* **Request Body:** JSON object with the following fields:
  * `audio_file_path` (string, required): The absolute path to the WAV audio file on the server's filesystem.
  * `language` (string, optional, default: `"en"`): The language code for transcription. Supported codes depend on the model (e.g., "en", "zh", "ja", "ko").
  * `use_itn` (boolean, optional, default: `false`): Whether to apply Inverse Text Normalization to the transcription output.

* **Example Request (`curl`):**
```bash
curl -X POST -H "Content-Type: application/json" \
     -d '{"audio_file_path": "/path/to/your/audio.wav", "language": "en", "use_itn": false}' \
     http://0.0.0.0:8000/transcribe
```
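
The same request can be made from Python with `requests` (a minimal sketch; assumes the server is reachable at `127.0.0.1:8000` and that `requests` is installed):

```python
import requests

# Ask the server to transcribe a WAV file that exists on the
# server's own filesystem (the path is sent as JSON, not uploaded).
resp = requests.post(
    "http://127.0.0.1:8000/transcribe",
    json={
        "audio_file_path": "/path/to/your/audio.wav",
        "language": "en",
        "use_itn": False,
    },
    timeout=300,  # long audio files can take a while
)
resp.raise_for_status()
result = resp.json()

print(result["full_transcription"])
for seg in result["segments"]:
    print(f"[{seg['start_time_s']:.2f}s - {seg['end_time_s']:.2f}s] {seg['text']}")
```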

* **Response Body:** JSON object with the following fields:
  * `full_transcription` (string): The complete transcribed text, including any special tokens from the model.
  * `segments` (list of objects): A list where each object represents a transcribed audio segment and contains:
    * `start_time_s` (float): Start time of the segment in seconds.
    * `end_time_s` (float): End time of the segment in seconds.
    * `text` (string): Transcribed text for the segment.

* **Example Response:**
```json
{
  "full_transcription": "<|en|><|HAPPY|><|Speech|><|woitn|>the stale smell of old beer lingers <|en|><|NEUTRAL|><|Speech|><|woitn|>it takes heat to bring out the odor but <|en|><|HAPPY|><|Speech|><|woitn|>a cold dip restores health and zest a salt pickle tastes fine with ham tacos al pastor are my favorite <|en|><|EMO_UNKNOWN|><|Speech|><|woitn|>a zestful food is the hot cross bun",
  "segments": [
    {
      "start_time_s": 1.01,
      "end_time_s": 3.93,
      "text": "<|en|><|HAPPY|><|Speech|><|woitn|>the stale smell of old beer lingers"
    },
    {
      "start_time_s": 4.21,
      "end_time_s": 6.59,
      "text": "<|en|><|NEUTRAL|><|Speech|><|woitn|>it takes heat to bring out the odor but"
    },
    {
      "start_time_s": 6.87,
      "end_time_s": 14.68,
      "text": "<|en|><|HAPPY|><|Speech|><|woitn|>a cold dip restores health and zest a salt pickle tastes fine with ham tacos al pastor are my favorite"
    },
    {
      "start_time_s": 14.96,
      "end_time_s": 18.34,
      "text": "<|en|><|EMO_UNKNOWN|><|Speech|><|woitn|>a zestful food is the hot cross bun"
    }
  ]
}
```