Commit
·
6fd5ec9
0
Parent(s):
初始提交 - 不包含二進制檔案
Browse files- .gitattributes +37 -0
- .gitignore +65 -0
- .vscodesettings_backup.json +0 -0
- README.md +49 -0
- app.py +966 -0
- assets/audio.wav +3 -0
- assets/sample_audio.mp3 +3 -0
- pyproject.toml +11 -0
- pyrightconfig.json +5 -0
- requirements.txt +22 -0
- requirements_local.txt +30 -0
.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python 虛擬環境
|
| 2 |
+
env/
|
| 3 |
+
env_new/
|
| 4 |
+
env_old/
|
| 5 |
+
venv/
|
| 6 |
+
ENV/
|
| 7 |
+
.env
|
| 8 |
+
.venv
|
| 9 |
+
# 匹配所有包含 'env' 的資料夾 (match any folder whose name contains 'env')
*env*/
|
| 10 |
+
|
| 11 |
+
# Python 快取檔案
|
| 12 |
+
__pycache__/
|
| 13 |
+
*.py[cod]
|
| 14 |
+
*$py.class
|
| 15 |
+
*.so
|
| 16 |
+
.Python
|
| 17 |
+
|
| 18 |
+
# 分發/打包
|
| 19 |
+
.Python
|
| 20 |
+
build/
|
| 21 |
+
develop-eggs/
|
| 22 |
+
dist/
|
| 23 |
+
downloads/
|
| 24 |
+
eggs/
|
| 25 |
+
.eggs/
|
| 26 |
+
lib/
|
| 27 |
+
lib64/
|
| 28 |
+
parts/
|
| 29 |
+
sdist/
|
| 30 |
+
var/
|
| 31 |
+
wheels/
|
| 32 |
+
*.egg-info/
|
| 33 |
+
.installed.cfg
|
| 34 |
+
*.egg
|
| 35 |
+
|
| 36 |
+
# 單元測試/覆蓋率報告
|
| 37 |
+
htmlcov/
|
| 38 |
+
.tox/
|
| 39 |
+
.coverage
|
| 40 |
+
.coverage.*
|
| 41 |
+
.cache
|
| 42 |
+
nosetests.xml
|
| 43 |
+
coverage.xml
|
| 44 |
+
*.cover
|
| 45 |
+
.hypothesis/
|
| 46 |
+
|
| 47 |
+
# Jupyter Notebook
|
| 48 |
+
.ipynb_checkpoints
|
| 49 |
+
|
| 50 |
+
# PyCharm, VSCode 等 IDE 設定
|
| 51 |
+
.idea/
|
| 52 |
+
.vscode/
|
| 53 |
+
*.swp
|
| 54 |
+
*.swo
|
| 55 |
+
|
| 56 |
+
# 操作系統相關
|
| 57 |
+
.DS_Store
|
| 58 |
+
Thumbs.db
|
| 59 |
+
|
| 60 |
+
# 專案特定
|
| 61 |
+
*.log
|
| 62 |
+
*.sqlite3
|
| 63 |
+
assets/*.wav
|
| 64 |
+
assets/*.mp3
|
| 65 |
+
env/
|
.vscodesettings_backup.json
ADDED
|
File without changes
|
README.md
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Automatic Speech Recognition Speech To Text
|
| 3 |
+
emoji: 🔊🔄📝
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.26.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
short_description: Automatic-Speech-Recognition-Speech-to-Text
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 15 |
+
|
| 16 |
+
# Clone repository
|
| 17 |
+
git clone https://huggingface.co/spaces/hchcsuim/Automatic-Speech-Recognition-Speech-to-Text
|
| 18 |
+
cd Automatic-Speech-Recognition-Speech-to-Text
|
| 19 |
+
|
| 20 |
+
# windows 安裝 ffmpeg
|
| 21 |
+
https://ffmpeg.org/download.html
|
| 22 |
+
下載版本 ffmpeg-git-full.7z
|
| 23 |
+
解壓縮到 C:\ffmpeg
|
| 24 |
+
加入環境變數 系統變數 path C:\ffmpeg\bin
|
| 25 |
+
|
| 26 |
+
# 要在電腦環境先安裝 python 3.10
|
| 27 |
+
|
| 28 |
+
# Create and activate Python environment
|
| 29 |
+
python -m venv env
|
| 30 |
+
source env/bin/activate # for linux
|
| 31 |
+
env\Scripts\activate # for windows
|
| 32 |
+
|
| 33 |
+
# 或者在 vscode 手動切換預設環境
|
| 34 |
+
Ctrl+Shift+P
|
| 35 |
+
Python: Select Interpreter
|
| 36 |
+
|
| 37 |
+
# Install dependencies
|
| 38 |
+
# requirements.txt is for Hugging Face Spaces
|
| 39 |
+
pip install -r requirements_local.txt
|
| 40 |
+
|
| 41 |
+
# 驗證 GPU 支援
|
| 42 |
+
(env_new) C:\Users\user\Automatic-Speech-Recognition-Speech-to-Text>python
|
| 43 |
+
import torch
|
| 44 |
+
print(f"CUDA available: {torch.cuda.is_available()}")
|
| 45 |
+
if torch.cuda.is_available():
|
| 46 |
+
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 47 |
+
|
| 48 |
+
# Run the app
|
| 49 |
+
python app.py
|
app.py
ADDED
|
@@ -0,0 +1,966 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
# from PIL import Image # Keep commented unless needed
|
| 3 |
+
import torch
|
| 4 |
+
from transformers import pipeline # Keep pipeline for standard models
|
| 5 |
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoConfig, AutoModelForCausalLM
|
| 6 |
+
import yt_dlp
|
| 7 |
+
import tempfile
|
| 8 |
+
import os
|
| 9 |
+
import shutil
|
| 10 |
+
import numpy as np
|
| 11 |
+
import time # For timestamp formatting
|
| 12 |
+
import soundfile as sf # For reading audio info
|
| 13 |
+
import traceback # For printing full errors
|
| 14 |
+
import platform
|
| 15 |
+
import re
|
| 16 |
+
import subprocess
|
| 17 |
+
|
| 18 |
+
# --- Hardware detection ---
def get_hardware_info():
    """Return a ``(cpu_info, gpu_info)`` tuple describing the host hardware.

    Returns:
        cpu_info: human-readable CPU model string; ``"Unknown CPU"`` when it
            cannot be determined.
        gpu_info: CUDA device name of GPU 0, or ``None`` when no CUDA device
            is available (or its name cannot be read).
    """
    cpu_info = "Unknown CPU"
    try:
        system = platform.system()
        if system == "Windows":
            # Query WMI for the CPU name; pass the command as a list with
            # shell=False (the original used a shell string with shell=True).
            output = subprocess.check_output(
                ["wmic", "cpu", "get", "name"]
            ).decode().strip().split('\n')
            # First line is the "Name" header; the model string follows.
            if len(output) >= 2:
                cpu_info = output[1].strip()
        elif system == "Linux":
            with open('/proc/cpuinfo', 'r') as f:
                for line in f:
                    if line.startswith('model name'):
                        cpu_info = line.split(':')[1].strip()
                        break
        elif system == "Darwin":  # macOS
            cpu_info = subprocess.check_output(
                ["sysctl", "-n", "machdep.cpu.brand_string"]
            ).decode().strip()
    except Exception as e:
        # Best-effort probe: keep the "Unknown CPU" default on any failure.
        print(f"Error getting CPU info: {e}")

    gpu_info = None
    if torch.cuda.is_available():
        try:
            gpu_info = torch.cuda.get_device_name(0)
        except Exception as e:
            print(f"Error getting GPU info: {e}")

    # Return the full CPU and GPU names without any simplification.
    return cpu_info, gpu_info
|
| 53 |
+
|
| 54 |
+
# --- Global Variables ---
# Mutable module state shared by the Gradio callbacks. Exactly one model
# family is resident at a time: either the ASR `pipe` or the Phi-4 pair.
pipe = None                # transformers ASR pipeline (whisper-style models)
phi4_model = None          # Phi-4 multimodal model, when selected
phi4_processor = None      # processor paired with phi4_model
current_model_name = None  # id of the currently loaded model (cache key)
current_device = "cpu"  # Default to CPU

# --- Model Data ---
PHI4_MODEL_ID = "microsoft/Phi-4-multimodal-instruct"
MERALION_MODEL_ID = "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION"
SEALLM_MODEL_ID = "SeaLLMs/SeaLLMs-Audio-7B"

# Selectable models with bilingual (EN/ZH) status labels. "type" picks the
# loading path: "whisper" -> transformers pipeline, "phi4" -> causal LM.
MODEL_DATA = [
    {"id": "openai/whisper-tiny", "params": "~39M", "size": "151 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"},
    {"id": "openai/whisper-base", "params": "~74M", "size": "290 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"},
    {"id": "openai/whisper-small", "params": "~244M", "size": "967 MB", "status_en": "Available", "status_zh": "可用", "type": "whisper"},
    {"id": "openai/whisper-medium", "params": "~769M", "size": "3.06 GB", "status_en": "Available (CPU Slow)", "status_zh": "可用 (CPU 慢)", "type": "whisper"},
    {"id": "openai/whisper-large", "params": "~1.55B", "size": "6.17 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"},
    {"id": "openai/whisper-large-v2", "params": "~1.55B", "size": "6.17 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"},
    {"id": "openai/whisper-large-v3", "params": "~1.55B", "size": "3.09 GB", "status_en": "Available (CPU Very Slow)", "status_zh": "可用 (CPU 極慢)", "type": "whisper"},
    {"id": "openai/whisper-large-v3-turbo", "params": "~809M", "size": "1.62 GB", "status_en": "Available (Optimized, CPU Slow)", "status_zh": "可用 (優化, CPU 慢)", "type": "whisper"},
    {"id": PHI4_MODEL_ID, "params": "~5.57B", "size": "11.15 GB", "status_en": "Multimodal (Need Trust, High RAM)", "status_zh": "多模態 (需信任,高RAM)", "type": "phi4"},
    # {"id": MERALION_MODEL_ID, "params": "~9.93B", "size": "19.85 GB", "status_en": "Experimental (Need Trust, High RAM)", "status_zh": "實驗性 (需信任,高RAM)", "type": "other"},
    # {"id": SEALLM_MODEL_ID, "params": "~8.29B", "size": "16.57 GB", "status_en": "Experimental (Need Trust, High RAM)", "status_zh": "實驗性 (需信任,高RAM)", "type": "other"},
]
MODEL_INFO_DICT = {m['id']: m for m in MODEL_DATA}  # model id -> metadata row
# Dropdown choices: (bilingual display label, model id) pairs.
MODEL_CHOICES_WITH_PARAMS = [
    (f"{m['id'].split('/')[-1]} ({m['params']}, {m['size']}) - {m['status_en']} / {m['status_zh']}", m['id'])
    for m in MODEL_DATA
]
DEFAULT_MODEL = "openai/whisper-tiny"

# --- Language Data ---
# Whisper language codes mapped to "English / Chinese" display names.
BILINGUAL_LANGUAGES_DICT = {
    "auto": "Auto-detect / 自動偵測", "en": "English / 英文", "zh": "Chinese / 中文", "de": "German / 德文", "es": "Spanish / 西班牙文",
    "ru": "Russian / 俄文", "ko": "Korean / 韓文", "fr": "French / 法文", "ja": "Japanese / 日文", "pt": "Portuguese / 葡萄牙文", "tr": "Turkish / 土耳其文",
    "pl": "Polish / 波蘭文", "ca": "Catalan / 加泰隆尼亞文", "nl": "Dutch / 荷蘭文", "ar": "Arabic / 阿拉伯文", "sv": "Swedish / 瑞典文", "it": "Italian / 義大利文",
    "id": "Indonesian / 印尼文", "hi": "Hindi / 印地文", "fi": "Finnish / 芬蘭文", "vi": "Vietnamese / 越南文", "he": "Hebrew / 希伯來文", "uk": "Ukrainian / 烏克蘭文",
    "el": "Greek / 希臘文", "ms": "Malay / 馬來文", "cs": "Czech / 捷克文", "ro": "Romanian / 羅馬尼亞文", "da": "Danish / 丹麥文", "hu": "Hungarian / 匈牙利文",
    "ta": "Tamil / 坦米爾文", "no": "Norwegian / 挪威文", "th": "Thai / 泰文", "ur": "Urdu / 烏爾都文", "hr": "Croatian / 克羅埃西亞文", "bg": "Bulgarian / 保加利亞文",
    "lt": "Lithuanian / 立陶宛文", "la": "Latin / 拉丁文", "mi": "Maori / 毛利文", "ml": "Malayalam / 馬拉雅拉姆文", "cy": "Welsh / 威爾斯文", "sk": "Slovak / 斯洛伐克文",
    "te": "Telugu / 泰盧固文", "fa": "Persian / 波斯文", "lv": "Latvian / 拉脫維亞文", "bn": "Bengali / 孟加拉文", "sr": "Serbian / 塞爾維亞文", "az": "Azerbaijani / 亞塞拜然文",
    "sl": "Slovenian / 斯洛維尼亞文", "kn": "Kannada / 坎那達文", "et": "Estonian / 愛沙尼亞文", "mk": "Macedonian / 馬其頓文", "br": "Breton / 布列塔尼文",
    "eu": "Basque / 巴斯克文", "is": "Icelandic / 冰島文", "hy": "Armenian / 亞美尼亞文", "ne": "Nepali / 尼泊爾文", "mn": "Mongolian / 蒙古文", "bs": "Bosnian / 波士尼亞文",
    "kk": "Kazakh / 哈薩克文", "sq": "Albanian / 阿爾巴尼亞文", "sw": "Swahili / 史瓦希里文", "gl": "Galician / 加利西亞文", "mr": "Marathi / 馬拉地文", "pa": "Punjabi / 旁遮普文",
    "si": "Sinhala / 僧伽羅文", "km": "Khmer / 高棉文", "sn": "Shona / 修納文", "yo": "Yoruba / 約魯巴文", "so": "Somali / 索馬利文", "af": "Afrikaans / 南非荷蘭文",
    "oc": "Occitan / 奧克西坦文", "ka": "Georgian / 喬治亞文", "be": "Belarusian / 白俄羅斯文", "tg": "Tajik / 塔吉克文", "sd": "Sindhi / 信德文", "gu": "Gujarati / 古吉拉特文",
    "am": "Amharic / 安哈拉文", "yi": "Yiddish / 意第緒文", "lo": "Lao / 寮文", "uz": "Uzbek / 烏茲別克文", "fo": "Faroese / 法羅文", "ht": "Haitian Creole / 海地克里奧爾文",
    "ps": "Pashto / 普什圖文", "tk": "Turkmen / 土庫曼文", "nn": "Nynorsk / 新挪威文", "mt": "Maltese / 馬爾他文", "sa": "Sanskrit / 梵文", "lb": "Luxembourgish / 盧森堡文",
    "my": "Myanmar / 緬甸文", "bo": "Tibetan / 藏文", "tl": "Tagalog / 他加祿文", "mg": "Malagasy / 馬達加斯加文", "as": "Assamese / 阿薩姆文", "tt": "Tatar / 韃靼文",
    "haw": "Hawaiian / 夏威夷文", "ln": "Lingala / 林加拉文", "ha": "Hausa / 豪沙文", "ba": "Bashkir / 巴什基爾文", "jw": "Javanese / 爪哇文", "su": "Sundanese / 巽他文",
    "yue": "Cantonese / 粵語",
}
# Dropdown list for whisper models: "auto" first, the rest sorted by the
# English half of the bilingual label.
WHISPER_LANGUAGES_LIST = []
WHISPER_LANGUAGES_LIST.append((BILINGUAL_LANGUAGES_DICT["auto"], "auto"))
def get_english_name(display_name_tuple): return display_name_tuple[0].split('/')[0].strip()  # sort key: English part of "English / 中文"
sorted_languages = sorted( [(display_name, code) for code, display_name in BILINGUAL_LANGUAGES_DICT.items() if code != "auto"], key=get_english_name )
WHISPER_LANGUAGES_LIST.extend(sorted_languages)
# Phi-4 audio input supports only this smaller language subset.
PHI4_AUDIO_LANG_CODES = ["auto", "en", "zh", "de", "fr", "it", "ja", "es", "pt"]
PHI4_LANGUAGES_LIST = [(BILINGUAL_LANGUAGES_DICT.get(code, code), code) for code in PHI4_AUDIO_LANG_CODES]

# --- Microphone Prompt ---
# Bilingual sample sentence shown beside the microphone input.
MIC_PROMPT = """**Try Reading / 試著朗讀:**
"Success is stumbling from failure to failure with no loss of enthusiasm." - Winston Churchill
「成功是在一次又一次失敗中,依然熱情不減地前行。」 - 溫斯頓・邱吉爾"""
|
| 119 |
+
|
| 120 |
+
# --- YouTube Audio Download Function ---
def download_youtube_audio(url):
    """Download the audio track of a YouTube video as an MP3 file.

    Args:
        url: YouTube watch URL.

    Returns:
        ``(downloaded_path, temp_dir, duration)`` on success, where
        ``duration`` is seconds reported by yt-dlp (may be None);
        ``(None, None, None)`` on any failure. Files go into a persistent
        per-user download directory so Gradio can keep serving them.
    """
    # Fixed download directory so the produced files are not deleted.
    download_dir = os.path.join(tempfile.gettempdir(), "youtube_downloads")
    os.makedirs(download_dir, exist_ok=True)

    # Build a unique file-name stem from the video id plus a timestamp.
    video_id = url.split("v=")[-1].split("&")[0] if "v=" in url else str(int(time.time()))
    filename = f"youtube_{video_id}_{int(time.time())}"

    temp_dir = tempfile.mkdtemp()
    downloaded_path = None
    try:
        # BUG FIX: the template previously hard-coded "(unknown)" instead of
        # interpolating `filename`, so the fallback scan below (which matches
        # on startswith(filename)) could never find the downloaded file.
        temp_filepath_tmpl = os.path.join(download_dir, f"{filename}.%(ext)s")
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': temp_filepath_tmpl,
            'noplaylist': True,
            'quiet': True,
            'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
            'ffmpeg_location': shutil.which("ffmpeg"),
        }
        if not ydl_opts['ffmpeg_location']:
            print("Warning: ffmpeg not found... / 警告:找不到 ffmpeg...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            duration = info_dict.get('duration')
            title = info_dict.get('title', 'unknown')

            # yt-dlp reports the pre-postprocessing name; swap in ".mp3"
            # because the FFmpeg postprocessor re-encodes the audio.
            final_filepath = ydl.prepare_filename(info_dict)
            if not final_filepath.endswith('.mp3'):
                base_name = final_filepath.rsplit('.', 1)[0]
                final_filepath = base_name + '.mp3'

            if os.path.exists(final_filepath):
                downloaded_path = final_filepath
                print(f"YouTube audio downloaded: {downloaded_path}")
                print(f"Title: {title}, Duration: {duration}s")
            else:
                # Fallback: scan the download dir for any MP3 with our stem.
                potential_files = [os.path.join(download_dir, f) for f in os.listdir(download_dir) if f.startswith(filename) and f.endswith(".mp3")]
                if potential_files:
                    downloaded_path = potential_files[0]
                    print(f"Warning: Could not find expected MP3, using fallback: {downloaded_path}")
                    duration = None
                else:
                    raise FileNotFoundError(f"Audio file not found after download in {download_dir}")

        return downloaded_path, temp_dir, duration
    except Exception as e:
        print(f"Error processing YouTube URL: {e}")
        # Clean up the scratch dir on failure (downloads live elsewhere).
        if temp_dir and os.path.exists(temp_dir):
            try: shutil.rmtree(temp_dir)
            except Exception as cleanup_e: print(f"Error cleaning temp directory {temp_dir}: {cleanup_e}")
        return None, None, None
|
| 173 |
+
|
| 174 |
+
# --- Timestamp Formatting ---
def format_timestamp(seconds):
    """Format a duration in seconds as ``H:MM:SS.mmm`` ("N/A" for None)."""
    if seconds is None:
        return "N/A"
    # Work in integer milliseconds to avoid float rounding in the output.
    total_ms = round(seconds * 1000)
    ms = total_ms % 1000
    total_s = int(total_ms // 1000)
    hrs, rem = divmod(total_s, 3600)
    mins, secs = divmod(rem, 60)
    return f"{hrs:01d}:{mins:02d}:{secs:02d}.{ms:03d}"
|
| 185 |
+
|
| 186 |
+
# --- Download helper ---
def update_download_file(filepath):
    """Return *filepath* when it names an existing file, otherwise None.

    Used to refresh the Gradio download component only when there is a
    real audio file on disk to offer.
    """
    usable = bool(filepath) and os.path.exists(filepath)
    return filepath if usable else None
|
| 192 |
+
|
| 193 |
+
# --- YouTube audio handling ---
def process_youtube_url(youtube_url):
    """Download audio for *youtube_url* and refresh the player/download UI.

    Returns a pair of ``gr.update`` objects for (audio player, download
    button): both visible with the downloaded path on success, both hidden
    when the URL is empty or the download fails.
    """
    if not youtube_url or not youtube_url.strip():
        return (gr.update(visible=False, value=None),
                gr.update(visible=False, value=None))

    try:
        print(f"Processing YouTube URL: {youtube_url}")
        # Only the audio path matters here; drop temp dir and duration.
        audio_path, _, _ = download_youtube_audio(youtube_url)
        if audio_path and os.path.exists(audio_path):
            return (gr.update(visible=True, value=audio_path),
                    gr.update(visible=True, value=audio_path))
    except Exception as e:
        print(f"Error processing YouTube URL: {e}")
    # Any failure path: hide both widgets.
    return (gr.update(visible=False, value=None),
            gr.update(visible=False, value=None))
|
| 212 |
+
|
| 213 |
+
# --- Load ASR Pipeline ---
def load_asr_pipeline(model_id):
    """Create (and store in the global ``pipe``) an ASR pipeline for *model_id*.

    Honors the global ``current_device`` ("gpu"/"cpu"): tries ``device="cuda"``
    first, retries with the integer index ``0``, and falls back to CPU with a
    warning when CUDA is unavailable. Clears any loaded Phi-4 state, since
    only one model family is resident at a time.

    Returns the pipeline; re-raises whatever ``pipeline()`` raises on failure.
    """
    global pipe, phi4_model, phi4_processor, current_device
    print(f"DEBUG: Loading ASR pipeline for {model_id} on device: {current_device}")
    # These community models ship custom code that must be trusted to run.
    trust_code = model_id in [MERALION_MODEL_ID, SEALLM_MODEL_ID]
    if trust_code: print(f"DEBUG: Setting trust_remote_code=True for pipeline model {model_id}")

    def _build(device):
        # Single construction point — the original repeated this call three
        # times with only the `device` argument differing.
        return pipeline(
            "automatic-speech-recognition",
            model=model_id,
            trust_remote_code=trust_code,
            device=device,
        )

    try:
        # Loading a whisper-style pipeline evicts any resident Phi-4 model.
        phi4_model = None
        phi4_processor = None

        if current_device == "gpu" and torch.cuda.is_available():
            try:
                pipe = _build("cuda")
                # NOTE: an attention-mask warning on first run is expected and harmless.
                print(f"DEBUG: Using GPU (CUDA) for ASR pipeline. Available GPU: {torch.cuda.get_device_name(0)}")
            except Exception as e:
                # Retry with an integer device index, accepted by older
                # transformers versions. (The original log wrongly blamed a
                # "device_map failure" — device_map is never used here.)
                pipe = _build(0)
                print(f"DEBUG: Using GPU (device=0) for ASR pipeline. Reason device='cuda' failed: {str(e)}")
        elif current_device == "gpu":
            # GPU requested but no CUDA device is present.
            pipe = _build("cpu")
            print("WARNING: GPU selected but CUDA is not available. Falling back to CPU.")
        else:  # CPU
            pipe = _build("cpu")
            # NOTE: an attention-mask warning on first run is expected and harmless.
            print("DEBUG: Using CPU for ASR pipeline.")
        print(f"DEBUG: Model loaded on device: {pipe.device}")
        return pipe
    except Exception as e:
        print(f"Error loading ASR pipeline for {model_id}:")
        traceback.print_exc()
        raise e
|
| 271 |
+
|
| 272 |
+
# --- Load Phi-4 Model ---
def load_phi4_model(model_id):
    """Load the Phi-4 multimodal model and processor into the module globals.

    Respects the global ``current_device`` ("gpu"/"cpu") with a fallback
    chain: fp16 on "cuda" -> fp16 on "cuda:0" -> fp32 on CPU. Clears the
    ASR ``pipe`` global, since only one model family is resident at a time.

    Returns ``(phi4_model, phi4_processor)``; re-raises load errors, with a
    hint appended when a known optional dependency appears to be missing.
    """
    global pipe, phi4_model, phi4_processor, current_device
    print(f"DEBUG: Loading Phi-4 model {model_id} on device: {current_device}")
    try:
        pipe = None
        phi4_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

        # Choose load parameters based on the selected device.
        if current_device == "gpu":
            # Check whether CUDA is actually usable.
            if torch.cuda.is_available():
                try:
                    # First attempt: default CUDA device.
                    phi4_model = AutoModelForCausalLM.from_pretrained(
                        model_id,
                        trust_remote_code=True,
                        torch_dtype=torch.float16,  # half precision to save GPU memory
                        _attn_implementation="eager",
                    )
                    phi4_model = phi4_model.to("cuda")
                    print(f"DEBUG: Using GPU (CUDA) for Phi-4. Available GPU: {torch.cuda.get_device_name(0)}")
                except Exception as e:
                    # Second attempt: explicitly target cuda:0.
                    try:
                        phi4_model = AutoModelForCausalLM.from_pretrained(
                            model_id,
                            trust_remote_code=True,
                            torch_dtype=torch.float16,
                            _attn_implementation="eager",
                        )
                        phi4_model = phi4_model.to("cuda:0")
                        print(f"DEBUG: Using GPU (device=0) for Phi-4. Reason for first attempt failure: {str(e)}")
                    except Exception as e2:
                        # Final fallback: full-precision CPU load.
                        phi4_model = AutoModelForCausalLM.from_pretrained(
                            model_id,
                            trust_remote_code=True,
                            torch_dtype=torch.float32,
                            _attn_implementation="eager",
                        )
                        phi4_model = phi4_model.to("cpu")
                        print(f"WARNING: Failed to use GPU for Phi-4, falling back to CPU. Error: {str(e2)}")
            else:
                # GPU selected but CUDA unavailable: warn and use CPU.
                phi4_model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                    torch_dtype=torch.float32,  # CPU normally runs in full precision
                    _attn_implementation="eager",
                )
                phi4_model = phi4_model.to("cpu")
                print("WARNING: GPU selected but CUDA is not available. Falling back to CPU for Phi-4.")
        else:  # CPU
            # Plain CPU load.
            phi4_model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.float32,  # CPU normally runs in full precision
                _attn_implementation="eager",
            )
            phi4_model = phi4_model.to("cpu")
            print("DEBUG: Using CPU for Phi-4.")

        print(f"DEBUG: Phi-4 model loaded on device: {next(phi4_model.parameters()).device}")
        return phi4_model, phi4_processor
    except Exception as e:
        print(f"Error loading Phi-4 model {model_id}:")
        traceback.print_exc()
        # Surface the most likely missing optional dependency in the message.
        if "scipy" in str(e) or "torchvision" in str(e) or "peft" in str(e):
            missing_pkg = "scipy" if "scipy" in str(e) else "torchvision" if "torchvision" in str(e) else "peft"
            raise type(e)(f"{e}. Please ensure '{missing_pkg}' is in requirements.txt") from e
        else: raise e
|
| 345 |
+
|
| 346 |
+
# --- Main Transcription Function ---
|
| 347 |
+
def transcribe_audio(mic_input, file_input, youtube_url, selected_model_identifier,
                     task, language, return_timestamps,
                     phi4_prompt_text, device_choice,
                     previous_output_text, active_tab):
    """Run one transcription/translation pass over the active tab's audio input.

    Loads (or reuses) the selected model, resolves the audio source from the
    microphone / file-upload / YouTube tab, runs inference, and builds a
    bilingual status + timing + result report.

    NOTE(review): return shape is inconsistent by design of the caller —
    the success path returns a plain string, while early-exit and error
    paths return a 4-tuple ``(text, gr.update(), gr.update(), gr.update())``.
    ``transcribe_audio_with_error_handling`` normalizes both shapes.
    """
    # Model/pipeline instances are cached in module-level globals between calls.
    global pipe, phi4_model, phi4_processor, current_model_name, current_device
    audio_source = None          # resolved filesystem path of the audio to process
    source_type_en = ""
    source_type_zh = ""
    temp_dir_to_clean = None     # YouTube download temp dir, removed in `finally`
    audio_duration = None        # seconds; used for the relative-speed report
    model_name_for_display = selected_model_identifier
    model_load_time = 0.0
    inference_time = 0.0
    model_type = MODEL_INFO_DICT.get(selected_model_identifier, {}).get("type", "other")
    output_text_accumulated = previous_output_text if previous_output_text else ""
    # New results are appended after a "---" separator when prior text exists.
    status_update_prefix = output_text_accumulated + ("\n\n---\n\n" if output_text_accumulated else "")
    final_output_text = output_text_accumulated

    # Apply a device change: switching CPU/GPU invalidates every cached model.
    if device_choice != current_device:
        current_device = device_choice
        print(f"DEBUG: Device changed to {current_device}")
        # Force a model reload on device change.
        pipe = None
        phi4_model = None
        phi4_processor = None
        current_model_name = None

    # --- Load Model ---
    model_changed = selected_model_identifier != current_model_name
    model_needs_load = (model_type == "phi4" and phi4_model is None) or (model_type != "phi4" and pipe is None)

    if model_changed or model_needs_load:
        warning_message = ""
        # (unused trust_code variable removed)
        if selected_model_identifier in [PHI4_MODEL_ID, MERALION_MODEL_ID, SEALLM_MODEL_ID]:
            warning_message += f"Warning: Model {selected_model_identifier} requires executing remote code.\n警告: 模型 {selected_model_identifier} 需要執行遠端程式碼。\n"
        if "seallms" in selected_model_identifier.lower() or "meralion" in selected_model_identifier.lower(): warning_message += f"Warning: Model {selected_model_identifier} likely requires >16GB RAM.\n警告: 模型 {selected_model_identifier} 可能需要 >16GB RAM。\n"
        if model_type == "phi4": warning_message += f"Warning: Phi-4 uses a different process.\n警告: Phi-4 使用不同處理流程。\n"
        print(f"Attempting to load model / 嘗試載入模型: {selected_model_identifier} (Type / 類型: {model_type})")
        status_update_str = warning_message + f"Loading model / 正在載入模型: {selected_model_identifier}..."
        # No `yield` here; progress is reported by rewriting output_text_accumulated.
        output_text_accumulated = status_update_prefix + status_update_str

        load_start_time = time.monotonic()
        try:
            # Phi-4 uses a model+processor pair; everything else uses an ASR pipeline.
            if model_type == "phi4":
                phi4_model, phi4_processor = load_phi4_model(selected_model_identifier)
                pipe = None
            else:
                pipe = load_asr_pipeline(selected_model_identifier)
                phi4_model = None
                phi4_processor = None
            load_end_time = time.monotonic()
            model_load_time = load_end_time - load_start_time
            current_model_name = selected_model_identifier
            model_name_for_display = current_model_name
            print(f"Model {current_model_name} loaded successfully ({model_load_time:.2f}s). / 模型 {current_model_name} 載入成功 ({model_load_time:.2f} 秒).")
            status_update_str = warning_message + f"Model {current_model_name} loaded successfully / 載入成功 ({model_load_time:.2f}s)."
            # Record the success status in the accumulated output.
            output_text_accumulated = status_update_prefix + status_update_str
        except Exception as e:
            load_end_time = time.monotonic()
            model_load_time = load_end_time - load_start_time
            print(f"Failed to load model {selected_model_identifier} ({model_load_time:.2f}s). / 載入模型 {selected_model_identifier} 失敗 ({model_load_time:.2f} 秒).")
            error_msg = f"Error: Failed to load model {selected_model_identifier}:\n錯誤: 載入模型 {selected_model_identifier} 失敗:\n{e}\n({model_load_time:.2f}s)"
            # Enrich the error with actionable hints based on the message text.
            if "requires `accelerate`" in str(e): error_msg += "\n**Missing 'accelerate'. Please install. / 缺少 'accelerate',請安裝.**"
            if isinstance(e, (MemoryError, RuntimeError)) and "out of memory" in str(e).lower(): error_msg += "\n**Out of Memory. Try a smaller model. / 記憶體不足,請嘗試較小模型.**"
            if "trust_remote_code=True" in str(e): error_msg += "\n**Requires trusting remote code. Model might be unsafe. / 需要信任遠端代碼,模型可能不安全.**"
            if "scipy" in str(e) or "torchvision" in str(e) or "peft" in str(e):
                missing_pkg = "scipy" if "scipy" in str(e) else "torchvision" if "torchvision" in str(e) else "peft"
                error_msg += f"\n**Missing '{missing_pkg}'. Please install. / 缺少 '{missing_pkg}',請安裝.**"
            status_update_str = warning_message + error_msg
            # Reset all cached model state so the next call retries a clean load.
            pipe = None
            phi4_model = None
            phi4_processor = None
            current_model_name = None
            output_text_accumulated = status_update_prefix + status_update_str
            return (output_text_accumulated, gr.update(), gr.update(), gr.update()) # Keep inputs

    # --- Check if model loaded ---
    if (model_type == "phi4" and phi4_model is None) or (model_type != "phi4" and pipe is None):
        output_text_accumulated = status_update_prefix + "Error: Cannot use model. / 錯誤: 無法使用模型."
        return (output_text_accumulated, gr.update(), gr.update(), gr.update())

    # --- Determine Input Source & Get Duration ---
    # Pick the input matching the currently active tab, ignoring the others.
    print(f"DEBUG: Active tab is {active_tab}")

    if active_tab == "mic" and mic_input is not None:
        audio_source = mic_input
        source_type_en = "Microphone"
        source_type_zh = "麥克風"
    elif active_tab == "file" and file_input is not None:
        # A gr.File component may return a single path or a list of paths.
        if isinstance(file_input, list) and len(file_input) > 0:
            # Use the first file of a multi-file selection.
            audio_source = file_input[0]
        else:
            audio_source = file_input
        source_type_en = "File Upload"
        source_type_zh = "檔案上傳"
    elif active_tab == "youtube" and youtube_url and youtube_url.strip():
        source_type_en = "YouTube"
        source_type_zh = "YouTube"
        status_update_str = f"Downloading YouTube Audio / 正在下載 YouTube 音訊..."
        output_text_accumulated = status_update_prefix + status_update_str
        audio_path, temp_dir, duration_yt = download_youtube_audio(youtube_url)
        if audio_path:
            audio_source = audio_path
            temp_dir_to_clean = temp_dir
            audio_duration = duration_yt
        else:
            output_text_accumulated = status_update_prefix + "Error: Failed to download YouTube audio. / 錯誤:無法下載 YouTube 音訊。"
            return (output_text_accumulated, gr.update(), gr.update(), gr.update())
    else:
        # No usable input on the active tab: return the previous text unchanged.
        return (previous_output_text, gr.update(), gr.update(), gr.update()) # No input

    if audio_source is None:
        output_text_accumulated = status_update_prefix + f"Error: No audio file provided. / 錯誤:未提供音訊檔案."
        return (output_text_accumulated, gr.update(), gr.update(), gr.update())

    # Ensure the audio file exists on disk.
    if not os.path.exists(audio_source):
        output_text_accumulated = status_update_prefix + f"Error: Audio file not found '{audio_source}'. / 錯誤:找不到音訊檔案 '{audio_source}'."
        return (output_text_accumulated, gr.update(), gr.update(), gr.update())

    # Reject files whose extension is not a recognized audio format.
    valid_audio_extensions = ['.wav', '.mp3', '.ogg', '.flac', '.m4a', '.aac']
    file_ext = os.path.splitext(audio_source)[1].lower()
    if file_ext not in valid_audio_extensions:
        output_text_accumulated = status_update_prefix + f"Error: Invalid audio file format '{file_ext}'. / 錯誤:無效的音訊檔案格式 '{file_ext}'."
        return (output_text_accumulated, gr.update(), gr.update(), gr.update())

    if audio_duration is None:
        try:
            # Choose the duration probe by format: wave for .wav, soundfile otherwise.
            if file_ext == '.wav':
                import wave
                try:
                    with wave.open(audio_source, 'rb') as wf:
                        frames = wf.getnframes()
                        rate = wf.getframerate()
                        audio_duration = frames / float(rate)
                        print(f"Got audio duration from wave module / 從 wave 模塊獲取音檔時長: {audio_duration:.2f}s")
                except Exception as wave_err:
                    print(f"Could not get audio duration from wave module / 無法從 wave 模塊獲取音檔時長: {wave_err}")
                    # Fall back to soundfile if the wave module fails.
                    info = sf.info(audio_source)
                    audio_duration = info.duration
                    print(f"Got audio duration from soundfile / 從 soundfile 獲取音檔時長: {audio_duration:.2f}s")
            else:
                # Non-WAV formats go through soundfile.
                info = sf.info(audio_source)
                audio_duration = info.duration
                print(f"Got audio duration from soundfile / 從 soundfile 獲取音檔時長: {audio_duration:.2f}s")
        except Exception as e:
            print(f"Could not get audio duration / 無法獲取音檔時長: {e}")
            # Duration unknown: use 0.0, which disables the relative-speed line below.
            audio_duration = 0.0
            print(f"Using default audio duration / 使用默認音檔時長: {audio_duration:.2f}s")

    print(f"Processing with {current_model_name} from [{source_type_en} / {source_type_zh}]: {audio_source}")
    print(f"Options: Task='{task}', Language(Source)='{language}', Timestamps='{return_timestamps}'")
    if model_type == "phi4": print(f"Phi-4 Prompt: '{phi4_prompt_text}'")

    status_update_str = f"Processing, please wait... / 正在處理,請稍候...\n(Model / 模型: {model_name_for_display})"
    output_text_accumulated = status_update_prefix + status_update_str

    # --- Execute & Timing ---
    inference_start_time = time.monotonic()
    current_run_output = ""
    timing_info_str = ""
    try:
        if model_type == "phi4":
            # Phi-4 path: build a chat-style prompt with an audio placeholder token.
            print("DEBUG: Processing with Phi-4...")
            if not phi4_model or not phi4_processor: raise ValueError("Phi-4 model/processor not loaded / Phi-4 模型/處理器未載入")
            if not phi4_prompt_text: raise ValueError("Phi-4 requires a prompt text / Phi-4 需要提示文字")
            user_prompt_tag='<|user|>'
            assistant_prompt_tag='<|assistant|>'
            end_tag='<|end|>'
            prompt = f"{user_prompt_tag}<|audio_1|>{phi4_prompt_text}{end_tag}{assistant_prompt_tag}"
            audio_data, samplerate = sf.read(audio_source)
            inputs = phi4_processor(text=prompt, audios=[(audio_data, samplerate)], return_tensors='pt').to(phi4_model.device)
            with torch.no_grad(): generate_ids = phi4_model.generate(**inputs, max_new_tokens=500, num_logits_to_keep=0) # Added num_logits_to_keep=0
            # Strip the prompt tokens; keep only newly generated ones.
            generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
            result_text = phi4_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
            current_run_output = result_text.strip()
            # Phi-4 produces no timestamps, so disable timestamp formatting below.
            return_timestamps = False
        else: # Whisper or other pipeline models
            print("DEBUG: Processing with ASR pipeline...")
            if not pipe: raise ValueError("ASR pipeline not loaded / ASR pipeline 未載入")
            generate_kwargs_pipe = {"task": task}

            # Interpret the language parameter according to the task.
            if task == "transcribe":
                # For transcription, `language` is the source language.
                if language != "auto":
                    generate_kwargs_pipe["language"] = language
                    print(f"DEBUG: Setting source language to {language} for transcription")
            else: # translate
                # Whisper only translates to English; `language` is logged but ignored.
                print(f"DEBUG: Translation target language is {language}, but Whisper only supports English as target")

            # Pipeline call parameters.
            pipeline_kwargs = {
                "chunk_length_s": 30,
                "batch_size": 1,
                "return_timestamps": "chunks" if return_timestamps else False,
                "generate_kwargs": generate_kwargs_pipe
            }

            # NOTE: the first run may emit an attention-mask warning; this is
            # harmless, and subsequent runs are warning-free and faster.
            result = pipe(audio_source, **pipeline_kwargs)

            print("DEBUG: pipe() call finished.")
            print("DEBUG: Raw result type:", type(result))
            print("DEBUG: Raw result content:", result)

            # Normalize the several possible pipeline result shapes to a string.
            if return_timestamps and isinstance(result, dict) and "chunks" in result:
                formatted_chunks = [f"[{format_timestamp(chunk.get('timestamp', (None,))[0])} -> {format_timestamp(chunk.get('timestamp', (None, None))[1])}] {chunk.get('text', '').strip()}" for chunk in result["chunks"]]
                current_run_output = "\n".join(formatted_chunks).strip()
            elif isinstance(result, dict) and "text" in result:
                current_run_output = result["text"].strip()
            elif isinstance(result, str):
                current_run_output = result.strip()
            elif isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict) and 'generated_text' in result[0]:
                current_run_output = result[0]['generated_text'].strip()
            else:
                current_run_output = f"(Unrecognized result format / 無法識別的結果格式: {type(result)})"

            print("DEBUG: Processed result:", current_run_output[:100] + "..." if len(current_run_output) > 100 else current_run_output)

        inference_end_time = time.monotonic()
        inference_time = inference_end_time - inference_start_time
        if not current_run_output: current_run_output = "(Audio empty or unrecognizable / 音檔空白或無法辨識)"

        # --- Format Timing Info (Plain Text, EN / ZH) ---
        timing_info_str = f"Model / 模型: {model_name_for_display}\n"
        if model_load_time > 0: timing_info_str += f"Model Load Time / 模型載入時間: {model_load_time:.2f} seconds / 秒\n"
        timing_info_str += f"Inference Time / 推論時間: {inference_time:.2f} seconds / 秒\n"
        relative_speed_str = "(relative time unavailable / 無法計算相對時間)"
        if audio_duration is not None and audio_duration > 0:
            relative_speed = inference_time / audio_duration
            # Corrected format for relative speed
            relative_speed_str = f"audio duration / 音檔長度 x {relative_speed:.2f}"
            timing_info_str += f"audio duration / 音檔時長: {audio_duration:.2f} seconds / 秒\n"
        timing_info_str += f"relative speed / 相對速度: {relative_speed_str}" # Corrected format

        print(f"Processing finished. / 處理完成。")
        print(timing_info_str.replace('\n', ' | '))
        print(f"Result Text / 結果文字:\n{current_run_output}") # Print result text

        # Guard against an empty recognition result.
        if not current_run_output or current_run_output.strip() == "":
            current_run_output = "No text detected in audio / 音頻中未檢測到文字"

        # Build the final output text: prior history + timing + labelled result.
        final_output_text = ""
        if status_update_prefix and status_update_prefix.strip():
            final_output_text += status_update_prefix + "\n"

        # Append model and timing information.
        final_output_text += timing_info_str + "\n\n"

        # Append the result text under an explicit label.
        final_output_text += "Result Text / 結果文字:\n" + current_run_output

        # Guard against an empty or dot-only final text (known Gradio quirk).
        final_output_text = final_output_text.strip()
        if final_output_text == "." or not final_output_text:
            final_output_text = timing_info_str + "\n\nResult Text / 結果文字:\n" + current_run_output

        # Last-resort rebuild if the output is still the lone "." artifact.
        if final_output_text == ".":
            print("DEBUG: Detected dot-only output, fixing...")
            # Construct a more meaningful replacement output.
            fixed_output = f"{timing_info_str}\n\nResult Text / 結果文字:\n{current_run_output}"
            return fixed_output
        # Success path: returns a plain string (normalized by the caller).
        return final_output_text

    except Exception as e:
        inference_end_time = time.monotonic()
        inference_time = inference_end_time - inference_start_time
        print(f"DEBUG: Exception occurred during processing / 處理過程中發生錯誤:")
        traceback.print_exc()
        error_message = f"Processing Failed / 處理失敗:\n{e}"
        final_output_text = (status_update_prefix + error_message).strip()
        timing_info_str = f"Model / 模型: {model_name_for_display}\n"
        if model_load_time > 0: timing_info_str += f"Model Load Time / 模型載入時間: {model_load_time:.2f} seconds / 秒\n"
        timing_info_str += f"Inference Time (until error) / 推論時間 (至錯誤): {inference_time:.2f} seconds / 秒\n"
        timing_info_str += "Processing Failed / 處理失敗"
        final_output_text += "\n\n" + timing_info_str
        if isinstance(e, (MemoryError, RuntimeError)) and "out of memory" in str(e).lower(): final_output_text += "\n\nOut of Memory, try smaller model. / 記憶體不足,請用小模型."

    finally:
        # Always clean up the YouTube temp dir, on success or failure.
        if temp_dir_to_clean:
            print(f"Cleaning YouTube temp files / 清理 YouTube 暫存: {temp_dir_to_clean}")
            # Corrected finally block syntax
            try:
                shutil.rmtree(temp_dir_to_clean)
            except Exception as e:
                print(f"Failed to clean temp files / 清理暫存失敗: {e}")

    # Reached only on the exception path (the try block returns on success).
    print("DEBUG: Returning final result tuple...")
    # Return final tuple: Update output_text, KEEP inputs by using gr.update()
    # If final_output_text is a dict (raw ASR pipeline output), return it as-is;
    # otherwise return the standard 4-tuple shape.
    if isinstance(final_output_text, dict):
        return final_output_text
    else:
        return (final_output_text, gr.update(), gr.update(), gr.update())
|
| 668 |
+
|
| 669 |
+
|
| 670 |
+
# --- UI Update Functions ---
|
| 671 |
+
# 添加一個函數來更新音頻播放器
|
| 672 |
+
def update_file_audio_player(file_path):
    """Show the audio preview player only for a valid, existing audio file.

    Accepts either a single path or a list of paths (the first entry is
    previewed) and hides the player whenever the selection is missing,
    absent on disk, or not a recognised audio format.
    """
    hidden = gr.update(value=None, visible=False)
    if file_path is None:
        return hidden

    # A gr.File component may hand back a list; preview the first file.
    candidate = file_path[0] if isinstance(file_path, list) and len(file_path) > 0 else file_path

    # The file must actually exist on disk.
    if not os.path.exists(candidate):
        return hidden

    # Only common audio container extensions are previewable.
    allowed = {'.wav', '.mp3', '.ogg', '.flac', '.m4a', '.aac'}
    if os.path.splitext(candidate)[1].lower() not in allowed:
        return hidden

    # Valid audio file: reveal the player with this file loaded.
    return gr.update(value=candidate, visible=True)
|
| 692 |
+
|
| 693 |
+
def update_task_choices(selected_model_id):
    """Refresh the task radio labels to match the selected model family."""
    kind = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other")
    # Whisper can only translate into English, so its translate label says so.
    if kind == "whisper":
        translate_choice = ("Translate (Whisper only to English) / 翻譯 (Whisper 僅支援轉譯至英文)", "translate")
    else:
        translate_choice = ("Translate / 轉譯", "translate")
    return gr.update(choices=[("Transcribe / 轉錄", "transcribe"), translate_choice])
|
| 698 |
+
|
| 699 |
+
def update_phi4_prompt_ui(selected_model_id, task, language_code):
    """Derive the default Phi-4 instruction text and toggle its textbox.

    The prompt textbox is visible only for Phi-4 models; its value depends
    on the chosen task and language code.
    """
    is_phi4 = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other") == "phi4"
    prompt_text = ""
    if is_phi4:
        # Dropdown labels look like "English / 英文"; keep the English half.
        display_name = BILINGUAL_LANGUAGES_DICT.get(language_code, language_code)
        english_name = display_name.split('/')[0].strip()
        if task == "transcribe":
            if language_code == "auto":
                prompt_text = "Transcribe the audio to text."
            else:
                prompt_text = f"Transcribe the audio in {english_name}."
        elif task == "translate":
            # For the translate task, language_code names the *target* language.
            if language_code in ("auto", "en"):
                # Auto/English target: plain translate-to-English prompt.
                prompt_text = "Translate the audio to text."
            else:
                # Explicit target: ask Phi-4 to detect and translate into it.
                prompt_text = f"Detect the language in the audio and translate it to {english_name}."
    # Update the textbox's visibility and value in a single gr.update.
    return gr.update(visible=is_phi4, value=prompt_text)
|
| 723 |
+
|
| 724 |
+
def update_language_choices(selected_model_id):
    """Swap the language dropdown options to the selected model's supported set."""
    kind = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other")
    choices = PHI4_LANGUAGES_LIST if kind == "phi4" else WHISPER_LANGUAGES_LIST
    # Reset to auto-detect whenever the model family changes.
    return gr.update(choices=choices, value="auto")
|
| 728 |
+
|
| 729 |
+
def update_timestamp_visibility(selected_model_id):
    """Hide the timestamp checkbox for Phi-4, which produces no timestamps."""
    model_type = MODEL_INFO_DICT.get(selected_model_id, {}).get("type", "other")
    # Trace the decision for debugging UI state changes.
    print(f"DEBUG: Updating timestamp visibility for {selected_model_id}. Type: {model_type}. Visible: {model_type != 'phi4'}")
    return gr.update(visible=(model_type != "phi4"))
|
| 733 |
+
|
| 734 |
+
def update_language_ui(model_id, task):
    """Relabel (and possibly hide) the language dropdown for the model/task combo."""
    model_type = MODEL_INFO_DICT.get(model_id, {}).get("type", "other")
    target_label = "Target Language / 目標語言"

    # Whisper translation always targets English, so a target picker is useless.
    if model_type == "whisper" and task == "translate":
        return gr.update(visible=False, label=target_label)

    # Otherwise keep the dropdown visible, labelled for the current task.
    if task == "transcribe":
        return gr.update(visible=True, label="Source Language / 來源語言")
    return gr.update(visible=True, label=target_label)
|
| 747 |
+
|
| 748 |
+
# --- Gradio Interface ---
|
| 749 |
+
# Preserving user's CSS choices
|
| 750 |
+
compact_css = """
.tabitem { margin: 0rem !important; padding: 0rem !important;}
.compact-file > div { min-height: unset !important; }
"""

# JavaScript was removed in favour of a pure-CSS solution.

with gr.Blocks(css=compact_css, theme=gr.themes.Default(spacing_size=gr.themes.sizes.spacing_sm, text_size=gr.themes.sizes.text_sm)) as demo:
    # Title only; GPU status is not shown here.
    gr.Markdown("# Automatic Speech Recognition(ASR) & Speech to Text(STT) / 語音辨識、語音轉文字 🔊🔄📝\nUse AI models to transcribe or translate speech from microphone, file uploads, or YouTube. / 使用 AI 模型轉錄或翻譯來自麥克風、上傳檔案或 YouTube 的語音。")

    with gr.Row():
        # Left Column: Input & Options
        with gr.Column(scale=4): # Preserving user's scale
            # Hidden state tracking which input tab is currently active.
            active_tab = gr.State(value="mic") # defaults to the microphone tab

            # Tab-switch callback: simply records the selected tab's name.
            def set_active_tab(tab_name):
                return tab_name

            with gr.Tabs() as tabs:
                with gr.TabItem("🎤 Microphone / 麥克風") as mic_tab:
                    gr.Markdown(MIC_PROMPT, elem_classes="compact-markdown")
                    mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio / 錄製音訊")
                    download_output = gr.File(label="Download Recording / 下載錄音檔", interactive=False, elem_classes="compact-file")

                with gr.TabItem("📁 Upload File / 上傳檔案") as file_tab:
                    # A File component is used instead of Audio to sidestep
                    # audio-processing issues on upload.
                    file_input = gr.File(label="Upload Audio File / 上傳音訊檔", file_types=["audio"], type="filepath")

                    # Preview player, revealed once a valid file is chosen.
                    file_audio_player = gr.Audio(label="Audio Preview / 音訊預覽", interactive=False, visible=False)

                with gr.TabItem("▶️ YouTube") as youtube_tab:
                    youtube_input = gr.Textbox(label="YouTube URL / 網址", placeholder="Paste YouTube link here / 在此貼上 YouTube 連結")
                    gr.Examples(examples=[["https://www.youtube.com/watch?v=5D7l0tqQJ7k"]], inputs=[youtube_input], label="Example YouTube URL / 範例 YouTube 網址")

                    # YouTube audio player plus a download link for the audio.
                    with gr.Row():
                        youtube_audio_player = gr.Audio(label="YouTube Audio / YouTube 音訊", interactive=False, visible=False)
                        youtube_download = gr.File(label="Download YouTube Audio / 下載 YouTube 音訊", interactive=False, visible=False, elem_classes="compact-file")

            # Keep active_tab in sync with the tab the user selects.
            mic_tab.select(fn=lambda: set_active_tab("mic"), inputs=[], outputs=[active_tab])
            file_tab.select(fn=lambda: set_active_tab("file"), inputs=[], outputs=[active_tab])
            youtube_tab.select(fn=lambda: set_active_tab("youtube"), inputs=[], outputs=[active_tab])

            # Options in a single column with ID for spacing
            with gr.Column(elem_id="options-block"): # elem_id for CSS targeting if needed
                model_select = gr.Dropdown(choices=MODEL_CHOICES_WITH_PARAMS, label="Model / 模型", value=DEFAULT_MODEL, elem_classes="compact-label")
                # Show the concrete CPU/GPU models in the device selector.
                cpu_info, gpu_info = get_hardware_info()
                device_choices = [(f"CPU ({cpu_info})", "cpu")]
                if torch.cuda.is_available() and gpu_info:
                    device_choices.append((f"GPU ({gpu_info})", "gpu"))
                device_input = gr.Radio(choices=device_choices, label="Device / 設備", value="cpu", elem_classes="compact-label radio-align")
                task_input = gr.Radio(choices=[("Transcribe / 轉錄", "transcribe"), ("Translate / 轉譯", "translate")], label="Task / 任務", value="transcribe", elem_classes="compact-label radio-align")
                language_input = gr.Dropdown(choices=WHISPER_LANGUAGES_LIST, label="Source Language / 來源語言", value="auto", elem_classes="compact-label")
                # Phi-4 prompt directly in the column, no Accordion
                phi4_prompt_input = gr.Textbox(label="Only for Phi-4 Prompt / 僅用於 Phi-4 指令", placeholder="e.g., Transcribe the audio to text.", lines=1, visible=False, elem_classes="compact-label") # Preserving user label and params
                timestamp_input = gr.Checkbox(label="Show Timestamps / 顯示時間戳", value=False, elem_classes="compact-label checkbox-align") # Preserving user label

        # Right Column: Output
        with gr.Column(scale=6): # Preserving user's scale
            submit_button = gr.Button("Submit / 提交", variant="primary") # Preserving user's text and placement
            output_text = gr.Textbox(
                label="Result / 結果",
                lines=25, # visible rows
                max_lines=25, # rows beyond this scroll
                interactive=True,
                placeholder="Results appear here (new results appended). / 結果將顯示在此 (新結果會附加在後面)",
                elem_classes="result-textbox", # keep the CSS hook
                autoscroll=False # let the user control scrolling
            )

    # --- Event Listeners ---
    model_select.change(fn=update_language_choices, inputs=model_select, outputs=language_input)
    model_select.change(fn=update_task_choices, inputs=[model_select], outputs=[task_input])
    # Link prompt update function correctly
    model_select.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input])
    task_input.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input])
    language_input.change(fn=update_phi4_prompt_ui, inputs=[model_select, task_input, language_input], outputs=[phi4_prompt_input])

    # Re-label/hide the language selector when the model or task changes.
    task_input.change(fn=update_language_ui, inputs=[model_select, task_input], outputs=language_input)
    model_select.change(fn=update_language_ui, inputs=[model_select, task_input], outputs=language_input)
    # Link timestamp visibility function
    model_select.change(fn=update_timestamp_visibility, inputs=model_select, outputs=timestamp_input)

    # Wire the recording download link.
    mic_input.change(fn=update_download_file, inputs=mic_input, outputs=download_output)

    # Wire the uploaded-file preview player.
    file_input.change(fn=update_file_audio_player, inputs=file_input, outputs=file_audio_player)

    # Wire YouTube URL handling (preview + download components).
    youtube_input.change(
        fn=process_youtube_url,
        inputs=youtube_input,
        outputs=[youtube_audio_player, youtube_download],
        show_progress=True
    )

    # Wrapper that calls transcribe_audio directly (no yield) and normalizes
    # its mixed string/tuple return into the 4-tuple Gradio expects.
    def transcribe_audio_with_error_handling(*args):
        try:
            # args[3] is selected_model_identifier (see submit_button inputs).
            selected_model_identifier = args[3]
            model_name_for_display = selected_model_identifier

            # Resolve the audio source only to log it / estimate work.
            audio_source = None
            active_tab = args[-1] # last positional input is active_tab

            if active_tab == "mic" and args[0] is not None:
                audio_source = args[0]
            elif active_tab == "file" and args[1] is not None:
                if isinstance(args[1], list) and len(args[1]) > 0:
                    audio_source = args[1][0]
                else:
                    audio_source = args[1]
            elif active_tab == "youtube" and args[2] and args[2].strip():
                # YouTube handling is more involved; no time estimate here.
                pass

            # Log the audio file if it exists.
            if audio_source and os.path.exists(audio_source):
                print(f"Processing audio file: {audio_source}")

            # Visual separator in the console between runs.
            print("\n" + "="*50)
            print("NEW TRANSCRIPTION PROCESS STARTED")
            print("="*50 + "\n")

            # Start timing.
            start_time = time.time()

            # Call transcribe_audio directly.
            result = transcribe_audio(*args)

            # Processing done.
            elapsed_time = time.time() - start_time

            # The result should now be a text string (or a tuple on errors).
            print("DEBUG: Result type:", type(result))
            print("DEBUG: Final result:", result)

            # Normalize the result into display text.
            if isinstance(result, str):
                if result.strip() == ".":
                    # A lone "." output is a known issue; rebuild a useful message.
                    print("DEBUG: Detected dot-only output in handler, fixing...")

                    # Temporary workaround: point the user at the console log,
                    # where the transcription was already printed.
                    model_info = f"Model / 模型: {model_name_for_display}"
                    inference_time_info = f"Processing Time / 處理時間: {elapsed_time:.2f} seconds / 秒"

                    final_text = f"{model_info}\n{inference_time_info}\n\nResult Text / 結果文字:\n"
                    final_text += "(Please check console for complete transcription / 請查看控制台獲取完整轉錄)"

                    print("DEBUG: Created replacement result:", final_text[:100] + "..." if len(final_text) > 100 else final_text)
                else:
                    # Normal string result; use it as-is.
                    final_text = result
                    print("DEBUG: Using original result text")
            else:
                # Non-string result (e.g. tuple): synthesize a report string.
                final_text = f"Model / 模型: {model_name_for_display}\n"
                final_text += f"Processing Time / 處理時間: {elapsed_time:.2f} seconds / 秒\n\n"
                final_text += "(No text detected in audio / 音頻中未檢測到文字)"
                print("DEBUG: Created new result for non-string:", final_text[:100] + "..." if len(final_text) > 100 else final_text)

            return final_text, gr.update(), gr.update(), gr.update()
        except Exception as e:
            import traceback
            error_msg = f"Error during processing: {str(e)}\n\n{traceback.format_exc()}"
            print(error_msg)

            # Return the error message; keep the other outputs unchanged.
            return f"處理過程中發生錯誤 / Error during processing:\n{str(e)}", gr.update(), gr.update(), gr.update()

    # Main submit action - Corrected outputs list
    submit_button.click(
        fn=transcribe_audio_with_error_handling,
        inputs=[mic_input, file_input, youtube_input, model_select, task_input, language_input, timestamp_input, phi4_prompt_input, device_input, output_text, active_tab],
        outputs=[output_text, mic_input, file_input, youtube_input], # keep original outputs
        show_progress="full" # show the full progress bar
    )
|
| 942 |
+
|
| 943 |
+
|
| 944 |
+
|
| 945 |
+
# --- Launch App ---
|
| 946 |
+
if __name__ == "__main__":
|
| 947 |
+
# 獲取硬體信息
|
| 948 |
+
cpu_info, gpu_info = get_hardware_info()
|
| 949 |
+
has_gpu = gpu_info is not None
|
| 950 |
+
|
| 951 |
+
print(f"CPU: {cpu_info}")
|
| 952 |
+
if has_gpu:
|
| 953 |
+
print(f"GPU: {gpu_info}")
|
| 954 |
+
else:
|
| 955 |
+
print("No GPU detected")
|
| 956 |
+
|
| 957 |
+
# REMEMBER: Update requirements.txt with accelerate, scipy, torchvision, peft
|
| 958 |
+
demo.launch(
|
| 959 |
+
debug=True,
|
| 960 |
+
max_threads=4, # 減少最大線程數,提高穩定性
|
| 961 |
+
show_error=True, # 顯示錯誤詳情
|
| 962 |
+
server_name="127.0.0.1", # 本地運行
|
| 963 |
+
server_port=7860, # 指定端口
|
| 964 |
+
quiet=False, # 顯示所有日誌
|
| 965 |
+
prevent_thread_lock=True # 防止線程鎖定
|
| 966 |
+
)
|
assets/audio.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b0481618037c60e33386078b873bbc90d4d28af8e07cf11da5aec2031645a49
|
| 3 |
+
size 1444758
|
assets/sample_audio.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2a7e73ccdc15b77808aa28d8ea7d6c86bae49f9d051d6d5d843faf1fb40c834
|
| 3 |
+
size 80640
|
pyproject.toml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.pyright]
|
| 2 |
+
include = ["app.py"]
|
| 3 |
+
exclude = ["**/node_modules", "**/__pycache__", "**/env", "**/env_new", "**/envsource"]
|
| 4 |
+
reportMissingImports = false
|
| 5 |
+
reportGeneralTypeIssues = false
|
| 6 |
+
reportOptionalMemberAccess = false
|
| 7 |
+
reportOptionalSubscript = false
|
| 8 |
+
reportOptionalCall = false
|
| 9 |
+
reportOptionalIterable = false
|
| 10 |
+
reportOptionalContextManager = false
|
| 11 |
+
reportOptionalOperand = false
|
pyrightconfig.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"venvPath": ".",
|
| 3 |
+
"venv": "env",
|
| 4 |
+
"reportMissingImports": false
|
| 5 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Main packages for Hugging Face Spaces
|
| 2 |
+
gradio>=5.0.0
|
| 3 |
+
transformers>=4.30.0
|
| 4 |
+
huggingface-hub>=0.15.0
|
| 5 |
+
torch>=2.0.0
|
| 6 |
+
torchvision>=0.15.0
|
| 7 |
+
torchaudio>=2.0.0
|
| 8 |
+
|
| 9 |
+
# Acceleration and optimization
|
| 10 |
+
accelerate>=1.0.0
|
| 11 |
+
safetensors>=0.3.0
|
| 12 |
+
|
| 13 |
+
# Audio processing
|
| 14 |
+
yt-dlp>=2023.0.0
|
| 15 |
+
soundfile>=0.12.0
|
| 16 |
+
pydub>=0.25.0
|
| 17 |
+
|
| 18 |
+
# Data processing
|
| 19 |
+
numpy>=2.0.0
|
| 20 |
+
scipy>=1.0.0 # Needed by Phi-4
|
| 21 |
+
peft>=0.5.0 # Needed by Phi-4
|
| 22 |
+
backoff>=2.0.0 # Needed by Phi-4
|
requirements_local.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Main packages
|
| 2 |
+
gradio>=5.0.0
|
| 3 |
+
transformers>=4.30.0
|
| 4 |
+
huggingface-hub>=0.15.0
|
| 5 |
+
|
| 6 |
+
# PyTorch with CUDA support - these will be installed from the specified URL
|
| 7 |
+
--extra-index-url https://download.pytorch.org/whl/cu126
|
| 8 |
+
torch==2.7.0+cu126
|
| 9 |
+
torchvision==0.22.0+cu126
|
| 10 |
+
torchaudio==2.7.0+cu126
|
| 11 |
+
|
| 12 |
+
# Acceleration and optimization
|
| 13 |
+
accelerate>=1.0.0
|
| 14 |
+
safetensors>=0.3.0
|
| 15 |
+
|
| 16 |
+
# Audio processing
|
| 17 |
+
yt-dlp>=2023.0.0
|
| 18 |
+
soundfile>=0.12.0
|
| 19 |
+
pydub>=0.25.0
|
| 20 |
+
|
| 21 |
+
# Data processing
|
| 22 |
+
numpy>=2.0.0
|
| 23 |
+
scipy>=1.0.0 # Needed by Phi-4
|
| 24 |
+
peft>=0.5.0 # Needed by Phi-4
|
| 25 |
+
backoff>=2.0.0 # Needed by Phi-4
|
| 26 |
+
|
| 27 |
+
# Important dependencies that might need specific versions
|
| 28 |
+
typing_extensions>=4.10.0
|
| 29 |
+
filelock>=3.0.0
|
| 30 |
+
fsspec>=2024.0.0
|