Spaces:

royyy74
/

beauty-vtuber-tts

Sleeping

App Files Files Community

royyy74 committed 25 days ago

Commit

6655b26

verified ·

1 Parent(s): 25637f7

Upload 4 files

Browse files

Files changed (4) hide show

.dockerignore +70 -0
Dockerfile +36 -0
app.py +352 -0
requirements.txt +10 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,70 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# Virtual environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# OS
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+# Git
+.git/
+.gitignore
+# Documentation
+README.md
+*.md
+# Logs
+*.log
+logs/
+# Test files
+test_*
+*_test.py
+tests/
+# Audio cache files
+*.wav
+*.mp3
+*.ogg
+audio_cache/

Dockerfile ADDED Viewed

	@@ -0,0 +1,36 @@

+FROM python:3.9-slim
+# Set working directory
+WORKDIR /app
+# Install system dependencies for audio processing
+RUN apt-get update && apt-get install -y \
+    gcc \
+    g++ \
+    libsndfile1 \
+    ffmpeg \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first for better caching
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY . .
+# Create non-root user for security
+RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
+USER appuser
+# Expose port
+EXPOSE 7860
+# Health check
+# The app loads its TTS model inside the startup handler, so /health is not
+# reachable until loading finishes. Give the container a generous start
+# period so it is not marked unhealthy while the model is still loading.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=300s --retries=3 \
+    CMD curl -f http://localhost:7860/health || exit 1
+# Run the application
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+import io
+import time
+import tempfile
+import base64
+from datetime import datetime
+from typing import Dict, Optional
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
+import torch
+import torchaudio
+from TTS.api import TTS
+import librosa
+import soundfile as sf
+from loguru import logger
+# Pydantic models for API
+class TTSRequest(BaseModel):
+    # Request body of the /synthesize endpoint.
+    # Text to be spoken.
+    text: str
+    # Only the exact value "id" selects Indonesian; anything else is sent to
+    # the model as "en".
+    language: str = "id"  # Indonesian
+    # Key into VOICE_STYLES; "kawaii" also enables the kawaii text decoration.
+    voice_style: str = "kawaii"
+    # Time-stretch rate applied after the voice style (1.0 leaves the audio as is).
+    speed: float = 1.0
+    # Used as the file extension of the generated audio.
+    format: str = "wav"
+class TTSResponse(BaseModel):
+    # Response body of the /synthesize endpoint.
+    # Relative URL of the form "/audio/<file name>", served by the /audio route.
+    audio_url: str
+    # Echo of the requested output format.
+    format: str
+    # Clip length in seconds (sample count divided by sample rate).
+    duration: float
+    # Wall-clock seconds spent handling the request.
+    processing_time: float
+# Initialize FastAPI app
+app = FastAPI(
+    title="Indonesian TTS Service",
+    description="Text-to-Speech service for Indonesian VTuber with kawaii voice",
+    version="1.0.0"
+)
+# Global TTS model; stays None until load_tts_model() succeeds.
+tts_model = None
+# Scratch directory for generated audio. It is created at import time and
+# removed again by the shutdown handler.
+temp_dir = tempfile.mkdtemp()
+# Voice style configurations, keyed by the names accepted as voice_style.
+# "pitch_shift" is handed to librosa as n_steps and "speed_factor" as the
+# time-stretch rate. "emotion" is not read anywhere in the code shown here.
+VOICE_STYLES = {
+    "kawaii": {
+        "pitch_shift": 4,  # Higher pitch for cute voice
+        "speed_factor": 0.9,  # Slightly slower
+        "emotion": "happy"
+    },
+    "friendly": {
+        "pitch_shift": 2,
+        "speed_factor": 1.0,
+        "emotion": "neutral"
+    },
+    "professional": {
+        "pitch_shift": 0,
+        "speed_factor": 1.1,
+        "emotion": "neutral"
+    }
+}
+def load_tts_model():
+    """Populate the global ``tts_model``, preferring XTTS v2 over an English fallback.
+    Returns:
+        True when either model was loaded, False when both attempts failed.
+    """
+    global tts_model
+    primary_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+    try:
+        logger.info(f"Loading TTS model: {primary_name}")
+        tts_model = TTS(primary_name)
+        # Only this primary model is moved to the GPU when one is available.
+        gpu_present = torch.cuda.is_available()
+        if gpu_present:
+            tts_model = tts_model.to("cuda")
+        logger.info("Using GPU for TTS" if gpu_present else "Using CPU for TTS")
+        return True
+    except Exception as primary_error:
+        logger.error(f"Failed to load TTS model: {primary_error}")
+    # Reached only when the primary attempt raised: try the simpler model.
+    fallback_name = "tts_models/en/ljspeech/tacotron2-DDC"
+    try:
+        tts_model = TTS(fallback_name)
+    except Exception as fallback_error:
+        logger.error(f"Failed to load fallback model: {fallback_error}")
+        return False
+    logger.info(f"Loaded fallback TTS model: {fallback_name}")
+    return True
+def apply_voice_style(audio_data: torch.Tensor, sample_rate: int, style: str) -> torch.Tensor:
+    """Return ``audio_data`` pitch-shifted and time-stretched for the named style.
+    Unknown style names use the "friendly" settings. Any failure is logged as
+    a warning and the input tensor is returned unmodified.
+    Args:
+        audio_data: Audio samples as a tensor.
+        sample_rate: Sample rate of ``audio_data`` in Hz.
+        style: Key into ``VOICE_STYLES``.
+    Returns:
+        The processed audio, or ``audio_data`` itself when processing failed.
+    """
+    try:
+        settings = VOICE_STYLES.get(style, VOICE_STYLES["friendly"])
+        # librosa works on numpy arrays, so round-trip through numpy.
+        samples = audio_data.numpy()
+        steps = settings["pitch_shift"]
+        if steps != 0:
+            samples = librosa.effects.pitch_shift(samples, sr=sample_rate, n_steps=steps)
+        rate = settings["speed_factor"]
+        if rate != 1.0:
+            samples = librosa.effects.time_stretch(samples, rate=rate)
+        return torch.from_numpy(samples)
+    except Exception as exc:
+        logger.warning(f"Failed to apply voice style: {exc}")
+        return audio_data
+def enhance_indonesian_pronunciation(text: str) -> str:
+    """Respell English beauty loanwords so they are read with Indonesian phonetics.
+    Matching is case-insensitive and happens in a single pass, so a
+    capitalised "Skincare" at the start of a sentence is handled and an
+    inserted respelling is never rewritten by a later entry. Matching stays
+    substring based, so suffixed forms such as "reviewnya" are covered too.
+    Args:
+        text: Raw text that is about to be synthesized.
+    Returns:
+        The text with every known term replaced by its lower-case respelling.
+    """
+    import re  # local import: ``re`` is not imported at file level
+    replacements = {
+        # English words commonly used in Indonesian beauty context
+        "skincare": "skin-ker",
+        "makeup": "mek-ap",
+        "foundation": "faun-de-syen",
+        "concealer": "kon-si-ler",
+        "moisturizer": "mois-tu-rai-zer",
+        "cleanser": "klen-ser",
+        "toner": "to-ner",
+        "serum": "se-rum",
+        "sunscreen": "san-skrin",
+        # Beauty terms
+        "anti-aging": "anti-e-jing",
+        "oil-free": "oil-fri",
+        "non-comedogenic": "non-ko-me-do-je-nik",
+        # Common expressions ("tips" needs no respelling, so it has no entry)
+        "guys": "gais",
+        "review": "ri-vyu",
+        "tutorial": "tu-to-ri-al",
+        "routine": "ru-tin",
+    }
+    # Longest terms first so the alternation prefers the most specific match.
+    terms = sorted(replacements, key=len, reverse=True)
+    # re.ASCII keeps case folding to ASCII letters, so every match lower-cases
+    # to one of the dictionary keys.
+    pattern = re.compile(
+        "|".join(re.escape(term) for term in terms),
+        re.IGNORECASE | re.ASCII,
+    )
+    return pattern.sub(lambda match: replacements[match.group(0).lower()], text)
+def add_kawaii_expressions(text: str) -> str:
+    """Insert the cute particle "nya" before exclamation marks in longer texts.
+    Texts of 50 characters or fewer are returned unchanged, as are texts that
+    already contain "nya" as a standalone word. The Indonesian suffix "-nya"
+    (as in "semuanya" or "skincare-nya") does not count as the particle, so
+    ordinary Indonesian sentences still get decorated.
+    Args:
+        text: Text to decorate.
+    Returns:
+        The text with " nya" inserted before each run of "!" characters.
+    """
+    import re  # local import: ``re`` is not imported at file level
+    # Don't overdo it - only decorate longer texts.
+    if len(text) <= 50:
+        return text
+    # Look for "nya" as its own word; letters, digits, "_" or "-" on either
+    # side mean it is part of another word.
+    if re.search(r"(?<![\w-])nya(?![\w-])", text, re.IGNORECASE):
+        return text
+    # A run such as "!!" is one exclamation and gets a single "nya".
+    return re.sub(r"!+", lambda marks: " nya" + marks.group(0), text)
+@app.on_event("startup")
+async def startup_event():
+    """Load the TTS model once at application start and log the outcome."""
+    logger.info("Starting Indonesian TTS Service...")
+    if load_tts_model():
+        logger.info("TTS Service ready!")
+        return
+    # The service keeps running without a model; /synthesize then answers 503.
+    logger.error("Failed to load TTS model during startup")
+@app.get("/health")
+async def health_check():
+    """Report that the service is up, whether a model is loaded, and the style names."""
+    model_ready = tts_model is not None
+    style_names = [name for name in VOICE_STYLES]
+    checked_at = datetime.now().isoformat()
+    return {
+        "status": "healthy",
+        "model_loaded": model_ready,
+        "available_styles": style_names,
+        "timestamp": checked_at,
+    }
+@app.post("/synthesize", response_model=TTSResponse)
+async def synthesize_speech(request: TTSRequest):
+    """Synthesize speech for ``request.text`` and return a link to the audio file.
+    The text is respelled for Indonesian pronunciation, decorated when the
+    "kawaii" style is requested, synthesized to a temporary WAV file,
+    post-processed (voice style, then the requested speed) and finally written
+    in the requested format. The processed file stays in ``temp_dir`` so the
+    /audio route can serve it.
+    Args:
+        request: Text plus language, voice style, speed and output format.
+    Returns:
+        TTSResponse with the relative ``/audio/<file>`` URL, the lower-cased
+        format, the clip duration in seconds and the processing time in seconds.
+    Raises:
+        HTTPException: 503 when no model is loaded; 400 for empty text, a
+            speed that is not a positive finite number, or an unsupported
+            format; 500 for any failure during synthesis or post-processing.
+    """
+    import uuid  # local import: ``uuid`` is not imported at file level
+    start_time = time.time()
+    # Validate outside the try block so these errors keep their own status
+    # codes instead of being re-wrapped as a 500.
+    if tts_model is None:
+        raise HTTPException(status_code=503, detail="TTS model not loaded")
+    if not request.text.strip():
+        raise HTTPException(status_code=400, detail="Text must not be empty")
+    # The chained comparison also rejects NaN and infinity.
+    if not (0 < request.speed < float("inf")):
+        raise HTTPException(status_code=400, detail="Speed must be a positive number")
+    # The format becomes a file extension, so only known values are allowed.
+    supported_formats = ("wav", "mp3", "ogg", "flac")
+    audio_format = request.format.lower()
+    if audio_format not in supported_formats:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported format; choose one of: {', '.join(supported_formats)}",
+        )
+    # A random id keeps requests arriving in the same second from sharing files.
+    request_id = uuid.uuid4().hex
+    output_file = os.path.join(temp_dir, f"tts_{request_id}.wav")
+    processed_file = os.path.join(temp_dir, f"processed_{request_id}.{audio_format}")
+    try:
+        # Enhance text for Indonesian pronunciation
+        enhanced_text = enhance_indonesian_pronunciation(request.text)
+        # Add kawaii expressions if requested
+        if request.voice_style == "kawaii":
+            enhanced_text = add_kawaii_expressions(enhanced_text)
+        logger.info(f"Synthesizing: {enhanced_text[:100]}...")
+        synthesis_args = {"text": enhanced_text, "file_path": output_file}
+        # Only multilingual models accept a language; the single-language
+        # fallback model must be called without one.
+        if getattr(tts_model, "is_multi_lingual", False):
+            # NOTE(review): XTTS is also multi-speaker and may need a real
+            # speaker or reference wav instead of None - confirm with the
+            # installed TTS version.
+            synthesis_args["speaker_wav"] = None
+            synthesis_args["language"] = "id" if request.language == "id" else "en"
+        tts_model.tts_to_file(**synthesis_args)
+        # Load and process audio
+        audio_data, sample_rate = torchaudio.load(output_file)
+        if request.voice_style in VOICE_STYLES:
+            audio_data = apply_voice_style(audio_data, sample_rate, request.voice_style)
+        if request.speed != 1.0:
+            stretched = librosa.effects.time_stretch(audio_data.numpy(), rate=request.speed)
+            audio_data = torch.from_numpy(stretched)
+        # Save processed audio
+        if audio_format == "wav":
+            torchaudio.save(processed_file, audio_data, sample_rate)
+        else:
+            # soundfile picks the container from the file extension
+            sf.write(processed_file, audio_data.numpy().T, sample_rate)
+        duration = audio_data.shape[1] / sample_rate
+        processing_time = time.time() - start_time
+        logger.info(f"TTS synthesis completed in {processing_time:.2f}s")
+        # The URL is relative; the /audio route serves the file from temp_dir.
+        return TTSResponse(
+            audio_url=f"/audio/{os.path.basename(processed_file)}",
+            format=audio_format,
+            duration=duration,
+            processing_time=processing_time
+        )
+    except Exception as e:
+        logger.error(f"TTS synthesis error: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        # Remove the raw synthesis output on success and on failure alike.
+        try:
+            os.remove(output_file)
+        except FileNotFoundError:
+            pass  # synthesis failed before the file was written
+        except OSError as cleanup_error:
+            logger.warning(f"Could not remove temporary file {output_file}: {cleanup_error}")
+@app.get("/audio/{filename}")
+async def get_audio_file(filename: str):
+    """Serve a generated audio file from the temporary directory.
+    Args:
+        filename: Bare file name as returned in ``audio_url`` by /synthesize.
+    Returns:
+        FileResponse whose media type matches the file extension.
+    Raises:
+        HTTPException: 404 when the name is not a plain file name or no such
+            file exists in the temporary directory.
+    """
+    # Accept plain file names only so a request cannot reach outside temp_dir.
+    is_plain_name = (
+        filename not in ("", ".", "..")
+        and filename == os.path.basename(filename)
+        and "\\" not in filename
+    )
+    if not is_plain_name:
+        raise HTTPException(status_code=404, detail="Audio file not found")
+    file_path = os.path.join(temp_dir, filename)
+    # isfile() also rejects directories, which exists() would accept.
+    if not os.path.isfile(file_path):
+        raise HTTPException(status_code=404, detail="Audio file not found")
+    media_types = {
+        ".wav": "audio/wav",
+        ".mp3": "audio/mpeg",
+        ".ogg": "audio/ogg",
+        ".flac": "audio/flac",
+    }
+    extension = os.path.splitext(filename)[1].lower()
+    return FileResponse(
+        file_path,
+        # Unknown extensions keep the previous default of audio/wav.
+        media_type=media_types.get(extension, "audio/wav"),
+        filename=filename
+    )
+@app.post("/test")
+async def test_synthesis():
+    """Run the synthesis endpoint on a fixed Indonesian sample in the kawaii style."""
+    sample_request = TTSRequest(
+        text="Halo semuanya! Selamat datang di live shopping kami! Hari ini ada banyak produk kecantikan yang bagus banget!",
+        voice_style="kawaii",
+        speed=1.0,
+    )
+    result = await synthesize_speech(sample_request)
+    return result
+@app.get("/styles")
+async def get_voice_styles():
+    """List the configured voice styles with a short description of each."""
+    style_descriptions = {
+        "kawaii": "Cute, high-pitched voice perfect for VTuber",
+        "friendly": "Warm and approachable voice",
+        "professional": "Clear and professional voice",
+    }
+    return {
+        "styles": VOICE_STYLES,
+        "default": "kawaii",
+        "description": style_descriptions,
+    }
+@app.get("/")
+async def root():
+    """Describe the service: name, version, status, languages and main endpoints."""
+    languages = ["id", "en"]
+    endpoint_paths = ["/synthesize", "/test", "/styles", "/health"]
+    return {
+        "service": "Indonesian TTS Service",
+        "version": "1.0.0",
+        "status": "running",
+        "supported_languages": languages,
+        "endpoints": endpoint_paths,
+    }
+# Cleanup function
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Delete the temporary audio directory when the application stops."""
+    try:
+        from shutil import rmtree
+        # ignore_errors makes removal best effort; anything still raised is logged.
+        rmtree(temp_dir, ignore_errors=True)
+        logger.info("TTS Service shutdown completed")
+    except Exception as exc:
+        logger.error(f"Shutdown error: {exc}")
+if __name__ == "__main__":
+    # Direct execution: serve on all interfaces at port 7860, the Hugging
+    # Face Spaces default port.
+    from uvicorn import run as serve
+    serve(app, host="0.0.0.0", port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi==0.104.1
+uvicorn==0.24.0
+TTS==0.20.6
+torch==2.1.1
+torchaudio==2.1.1
+librosa==0.10.1
+soundfile==0.12.1
+pydantic==2.5.0
+loguru==0.7.2
+numpy==1.24.3