royyy74 committed on
Commit
6655b26
·
verified ·
1 Parent(s): 25637f7

Upload 4 files

Browse files
Files changed (4) hide show
  1. .dockerignore +70 -0
  2. Dockerfile +36 -0
  3. app.py +352 -0
  4. requirements.txt +10 -0
.dockerignore ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ # Virtual environments
25
+ .env
26
+ .venv
27
+ env/
28
+ venv/
29
+ ENV/
30
+ env.bak/
31
+ venv.bak/
32
+
33
+ # IDE
34
+ .vscode/
35
+ .idea/
36
+ *.swp
37
+ *.swo
38
+ *~
39
+
40
+ # OS
41
+ .DS_Store
42
+ .DS_Store?
43
+ ._*
44
+ .Spotlight-V100
45
+ .Trashes
46
+ ehthumbs.db
47
+ Thumbs.db
48
+
49
+ # Git
50
+ .git/
51
+ .gitignore
52
+
53
+ # Documentation
54
+ README.md
55
+ *.md
56
+
57
+ # Logs
58
+ *.log
59
+ logs/
60
+
61
+ # Test files
62
+ test_*
63
+ *_test.py
64
+ tests/
65
+
66
+ # Audio cache files
67
+ *.wav
68
+ *.mp3
69
+ *.ogg
70
+ audio_cache/
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Install system dependencies for audio processing.
# --no-install-recommends keeps optional packages out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc \
        g++ \
        libsndfile1 \
        ffmpeg \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Create non-root user for security and hand it the (still empty) workdir.
RUN useradd -m -u 1000 appuser && chown appuser:appuser /app

# Copy application code already owned by appuser; a recursive chown after
# COPY would duplicate every copied file in an extra image layer.
COPY --chown=appuser:appuser . .

USER appuser

# Expose port
EXPOSE 7860

# Health check. The app loads its TTS model during startup, before it starts
# answering requests, so allow a generous start period before failed probes
# count against the container.
HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import time
4
+ import tempfile
5
+ import base64
6
+ from datetime import datetime
7
+ from typing import Dict, Optional
8
+ from fastapi import FastAPI, HTTPException
9
+ from fastapi.responses import FileResponse
10
+ from pydantic import BaseModel
11
+ import torch
12
+ import torchaudio
13
+ from TTS.api import TTS
14
+ import librosa
15
+ import soundfile as sf
16
+ from loguru import logger
17
+
18
+ # Pydantic models for API
19
class TTSRequest(BaseModel):
    # Request body accepted by the /synthesize endpoint.
    # Documented with comments rather than a docstring so the generated
    # OpenAPI schema stays exactly as it was.

    # Text to be spoken.
    text: str
    # Language code; "id" selects Indonesian, anything else is treated as "en"
    # by the synthesis endpoint.
    language: str = "id"
    # Key into VOICE_STYLES ("kawaii", "friendly" or "professional").
    voice_style: str = "kawaii"
    # Extra time-stretch rate applied after the voice style (1.0 = unchanged).
    speed: float = 1.0
    # Output container/extension of the generated audio file.
    format: str = "wav"
25
+
26
class TTSResponse(BaseModel):
    # Response body returned by the /synthesize endpoint.
    # Documented with comments rather than a docstring so the generated
    # OpenAPI schema stays exactly as it was.

    # Relative URL ("/audio/<file>") where the generated audio can be fetched.
    audio_url: str
    # Audio format of the generated file.
    format: str
    # Length of the generated audio in seconds.
    duration: float
    # Wall-clock seconds spent handling the request.
    processing_time: float
31
+
32
+ # Initialize FastAPI app
33
# Metadata shown in the auto-generated API documentation.
_APP_METADATA = {
    "title": "Indonesian TTS Service",
    "description": "Text-to-Speech service for Indonesian VTuber with kawaii voice",
    "version": "1.0.0",
}

app = FastAPI(**_APP_METADATA)
38
+
39
+ # Global TTS model
40
# Scratch directory for generated audio; removed again on shutdown.
temp_dir = tempfile.mkdtemp()

# Populated by load_tts_model() at startup; stays None if loading failed.
tts_model = None
42
+
43
+ # Voice style configurations
44
# Per-style post-processing settings:
#   pitch_shift  - semitones passed to librosa's pitch shifter (0 = none)
#   speed_factor - time-stretch rate (below 1.0 is slower, above is faster)
#   emotion      - descriptive label only
VOICE_STYLES = dict(
    # Higher pitch and slightly slower delivery for a cute voice.
    kawaii=dict(pitch_shift=4, speed_factor=0.9, emotion="happy"),
    friendly=dict(pitch_shift=2, speed_factor=1.0, emotion="neutral"),
    professional=dict(pitch_shift=0, speed_factor=1.1, emotion="neutral"),
)
61
+
62
def load_tts_model():
    """Load a TTS model into the module-level ``tts_model``.

    The multilingual XTTS v2 model is tried first; if it cannot be loaded,
    a simpler English model is used as a fallback. Whichever model loads is
    moved to the GPU when CUDA is available and otherwise stays on the CPU.

    The global is only assigned once a model is fully initialised, so a
    failure half-way through (for example during the device move) never
    leaves a partially set-up model visible to request handlers.

    Returns:
        True if a model was loaded, False if every candidate failed.
    """
    global tts_model

    primary_model = "tts_models/multilingual/multi-dataset/xtts_v2"
    fallback_model = "tts_models/en/ljspeech/tacotron2-DDC"

    for model_name in (primary_model, fallback_model):
        is_fallback = model_name != primary_model
        try:
            logger.info(f"Loading TTS model: {model_name}")
            candidate = TTS(model_name)

            # Prefer the GPU when one exists; CPU-only hosts use the CPU.
            if torch.cuda.is_available():
                candidate = candidate.to("cuda")
                logger.info("Using GPU for TTS")
            else:
                logger.info("Using CPU for TTS")
        except Exception as e:
            if is_fallback:
                logger.error(f"Failed to load fallback model: {e}")
            else:
                logger.error(f"Failed to load TTS model: {e}")
            continue

        # Publish the model only after it is completely ready.
        tts_model = candidate
        if is_fallback:
            logger.info(f"Loaded fallback TTS model: {model_name}")
        return True

    return False
95
+
96
def apply_voice_style(audio_data: torch.Tensor, sample_rate: int, style: str) -> torch.Tensor:
    """Post-process audio according to one of the VOICE_STYLES presets.

    Unknown style names use the "friendly" preset. The tensor is converted
    to NumPy, optionally pitch-shifted and time-stretched with librosa, and
    converted back to a tensor.

    This is best-effort: if any step raises, a warning is logged and the
    input tensor is returned unchanged.
    """
    try:
        preset = VOICE_STYLES.get(style, VOICE_STYLES["friendly"])

        # librosa works on NumPy arrays, not tensors.
        samples = audio_data.numpy()

        semitones = preset["pitch_shift"]
        if semitones != 0:
            samples = librosa.effects.pitch_shift(samples, sr=sample_rate, n_steps=semitones)

        stretch_rate = preset["speed_factor"]
        if stretch_rate != 1.0:
            samples = librosa.effects.time_stretch(samples, rate=stretch_rate)

        return torch.from_numpy(samples)
    except Exception as e:
        logger.warning(f"Failed to apply voice style: {e}")
        return audio_data
125
+
126
def enhance_indonesian_pronunciation(text: str) -> str:
    """Respell common English loanwords so they are read the Indonesian way.

    Matching is case-insensitive (ASCII only), so "Skincare" or "MAKEUP" at
    the start of a sentence are handled as well as their lowercase forms.
    Matches are plain substrings, which means words carrying a suffix
    (e.g. "serumnya") are still respelled. A match whose first letter is
    upper-case gets a replacement with its first letter capitalised.

    Args:
        text: Text about to be synthesised.

    Returns:
        The text with every known loanword replaced by its respelling.
    """
    import re  # local import: only this function needs it

    replacements = {
        # English words commonly used in Indonesian beauty context
        "skincare": "skin-ker",
        "makeup": "mek-ap",
        "foundation": "faun-de-syen",
        "concealer": "kon-si-ler",
        "moisturizer": "mois-tu-rai-zer",
        "cleanser": "klen-ser",
        "toner": "to-ner",
        "serum": "se-rum",
        "sunscreen": "san-skrin",
        # Beauty terms
        "anti-aging": "anti-e-jing",
        "oil-free": "oil-fri",
        "non-comedogenic": "non-ko-me-do-je-nik",
        # Common expressions
        "guys": "gais",
        "review": "ri-vyu",
        "tutorial": "tu-to-ri-al",
        "tips": "tips",
        "routine": "ru-tin",
    }

    # One alternation, longest keys first, so the text is rewritten in a
    # single pass and a longer term always wins over a shorter one starting
    # at the same position. re.ASCII keeps case folding to A-Z so that
    # match.lower() is always one of the dictionary keys.
    keys_longest_first = sorted(replacements, key=len, reverse=True)
    pattern = re.compile(
        "|".join(re.escape(key) for key in keys_longest_first),
        re.IGNORECASE | re.ASCII,
    )

    def _respell(match):
        found = match.group(0)
        respelled = replacements[found.lower()]
        if found[0].isupper():
            respelled = respelled[0].upper() + respelled[1:]
        return respelled

    return pattern.sub(_respell, text)
160
+
161
def add_kawaii_expressions(text: str) -> str:
    """Add a cute "nya" before exclamation marks in longer texts.

    Only texts longer than 50 characters that do not already contain "nya"
    (in any letter case) are changed; in those, every "!" becomes " nya!".
    Everything else is returned untouched so the effect is not overdone.

    Args:
        text: Text about to be synthesised.

    Returns:
        The text, possibly with " nya" inserted before each "!".
    """
    # Don't overdo it - short texts and texts that already say "nya" stay as is.
    if len(text) <= 50 or "nya" in text.lower():
        return text

    # Add nya to exclamations
    return text.replace("!", " nya!")
177
+
178
@app.on_event("startup")
async def startup_event():
    """Initialize TTS model on startup"""
    logger.info("Starting Indonesian TTS Service...")

    # The service still starts when loading fails; /synthesize then answers
    # with an error until a model is available.
    if load_tts_model():
        logger.info("TTS Service ready!")
    else:
        logger.error("Failed to load TTS model during startup")
188
+
189
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    # Without a model the service cannot synthesise anything, so it must not
    # claim to be healthy; "degraded" tells monitors the process is up but
    # unusable.
    model_loaded = tts_model is not None
    return {
        "status": "healthy" if model_loaded else "degraded",
        "model_loaded": model_loaded,
        "available_styles": list(VOICE_STYLES.keys()),
        "timestamp": datetime.now().isoformat()
    }
198
+
199
@app.post("/synthesize", response_model=TTSResponse)
async def synthesize_speech(request: TTSRequest):
    """Main TTS synthesis endpoint"""
    # Responds with:
    #   400 - empty text, non-positive speed, or a malformed format name
    #   503 - no TTS model is loaded
    #   500 - any failure during synthesis or post-processing
    import uuid  # local import: only needed for unique file names

    start_time = time.time()

    # Validate before entering the try block so these HTTP errors are not
    # swallowed by the generic handler and turned into a 500.
    if tts_model is None:
        raise HTTPException(status_code=503, detail="TTS model not loaded")
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text must not be empty")
    if request.speed <= 0:
        raise HTTPException(status_code=400, detail="Speed must be greater than 0")

    # The format becomes a file extension, so only allow a plain
    # alphanumeric token (no dots, slashes or other path characters).
    audio_format = request.format.lower()
    if not audio_format.isalnum():
        raise HTTPException(status_code=400, detail=f"Unsupported audio format: {request.format}")

    # A random id per request: timestamps with one-second resolution let
    # concurrent requests overwrite each other's files.
    job_id = uuid.uuid4().hex
    output_file = os.path.join(temp_dir, f"tts_{job_id}.wav")
    processed_file = os.path.join(temp_dir, f"processed_{job_id}.{audio_format}")

    try:
        # Enhance text for Indonesian pronunciation
        enhanced_text = enhance_indonesian_pronunciation(request.text)

        # Add kawaii expressions if requested
        if request.voice_style == "kawaii":
            enhanced_text = add_kawaii_expressions(enhanced_text)

        logger.info(f"Synthesizing: {enhanced_text[:100]}...")

        # Only multilingual models accept a language; passing one to a
        # single-language model (such as the English fallback) is rejected
        # by the TTS API.
        synth_kwargs = {"text": enhanced_text, "file_path": output_file}
        if getattr(tts_model, "is_multi_lingual", False):
            # NOTE(review): confirm that "id" is in the loaded model's language
            # list and whether the model needs a speaker / speaker_wav
            # reference; neither can be verified from this file.
            synth_kwargs["language"] = "id" if request.language == "id" else "en"
        tts_model.tts_to_file(**synth_kwargs)

        # Load and process audio
        audio_data, sample_rate = torchaudio.load(output_file)

        # Apply voice style
        if request.voice_style in VOICE_STYLES:
            audio_data = apply_voice_style(audio_data, sample_rate, request.voice_style)

        # Apply speed adjustment
        if request.speed != 1.0:
            stretched = librosa.effects.time_stretch(audio_data.numpy(), rate=request.speed)
            audio_data = torch.from_numpy(stretched)

        # Save processed audio
        if audio_format == "wav":
            torchaudio.save(processed_file, audio_data, sample_rate)
        else:
            # Convert to requested format using soundfile, which expects
            # (frames, channels) rather than (channels, frames).
            sf.write(processed_file, audio_data.numpy().T, sample_rate)

        duration = audio_data.shape[-1] / sample_rate
        processing_time = time.time() - start_time

        # The file is served by the /audio/{filename} endpoint.
        audio_url = f"/audio/{os.path.basename(processed_file)}"

        logger.info(f"TTS synthesis completed in {processing_time:.2f}s")

        return TTSResponse(
            audio_url=audio_url,
            format=audio_format,
            duration=duration,
            processing_time=processing_time
        )

    except HTTPException:
        # Already a well-formed HTTP error; keep its status code.
        raise
    except Exception as e:
        logger.error(f"TTS synthesis error: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # The raw synthesis output is never needed after this request,
        # whether it succeeded or not. It may not exist if synthesis failed.
        try:
            os.remove(output_file)
        except OSError:
            pass
286
+
287
@app.get("/audio/{filename}")
async def get_audio_file(filename: str):
    """Serve generated audio files"""
    # Responds with 404 when the name is not a plain file name or no such
    # file exists in the temporary audio directory.

    # Only a bare file name may be served; anything that could point outside
    # temp_dir (path separators, "." or "..") is treated as not found.
    if os.path.basename(filename) != filename or filename in (".", ".."):
        raise HTTPException(status_code=404, detail="Audio file not found")

    file_path = os.path.join(temp_dir, filename)

    # isfile (not exists) so a directory is never handed to FileResponse.
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="Audio file not found")

    # Report the real type; non-WAV output used to be labelled audio/wav.
    media_types = {
        ".wav": "audio/wav",
        ".mp3": "audio/mpeg",
        ".ogg": "audio/ogg",
        ".flac": "audio/flac",
    }
    extension = os.path.splitext(filename)[1].lower()
    media_type = media_types.get(extension, "application/octet-stream")

    return FileResponse(
        file_path,
        media_type=media_type,
        filename=filename
    )
300
+
301
@app.post("/test")
async def test_synthesis():
    """Test endpoint for quick TTS testing"""
    # Fixed Indonesian sample sentence, run through the regular synthesis
    # endpoint with the kawaii style at normal speed.
    sample_request = TTSRequest(
        text="Halo semuanya! Selamat datang di live shopping kami! Hari ini ada banyak produk kecantikan yang bagus banget!",
        voice_style="kawaii",
        speed=1.0,
    )
    result = await synthesize_speech(sample_request)
    return result
313
+
314
@app.get("/styles")
async def get_voice_styles():
    """Get available voice styles"""
    # Human-readable summary for each style key.
    style_descriptions = {
        "kawaii": "Cute, high-pitched voice perfect for VTuber",
        "friendly": "Warm and approachable voice",
        "professional": "Clear and professional voice",
    }

    payload = {"styles": VOICE_STYLES, "default": "kawaii"}
    payload["description"] = style_descriptions
    return payload
326
+
327
@app.get("/")
async def root():
    """Root endpoint"""
    # Static service description; nothing here reflects runtime state.
    languages = ["id", "en"]
    endpoints = ["/synthesize", "/test", "/styles", "/health"]
    return dict(
        service="Indonesian TTS Service",
        version="1.0.0",
        status="running",
        supported_languages=languages,
        endpoints=endpoints,
    )
337
+
338
+ # Cleanup function
339
@app.on_event("shutdown")
async def shutdown_event():
    """Cleanup on shutdown"""
    try:
        from shutil import rmtree

        # Remove the temporary audio directory and everything generated in
        # it; removal errors are ignored.
        rmtree(temp_dir, ignore_errors=True)
        logger.info("TTS Service shutdown completed")
    except Exception as e:
        logger.error(f"Shutdown error: {e}")
349
+
350
if __name__ == "__main__":
    # Direct-run entry point; the Docker image starts uvicorn itself.
    from uvicorn import run as serve

    # 7860 is the Hugging Face Spaces default port.
    serve(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn==0.24.0
3
+ TTS==0.20.6
4
+ torch==2.1.1
5
+ torchaudio==2.1.1
6
+ librosa==0.10.1
7
+ soundfile==0.12.1
8
+ pydantic==2.5.0
9
+ loguru==0.7.2
10
+ numpy==1.24.3