Kokoro-TTS

Running on Zero

App Files Community

hysts HF Staff committed on May 31

Commit

8a49e4a

1 Parent(s): 691e7fc

Update

Browse files

Files changed (1) hide show

app.py +46 -32

app.py CHANGED Viewed

@@ -44,42 +44,56 @@ for v in CHOICES.values():
 @spaces.GPU(duration=30)
 def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
-    """Generate audio from text using Kokoro TTS model.
     Available voices:
-    - af_heart
-    - af_bella
-    - af_nicole
-    - af_aoede
-    - af_kore
-    - af_sarah
-    - af_nova
-    - af_sky
-    - af_alloy
-    - af_jessica
-    - af_river
-    - am_michael
-    - am_fenrir
-    - am_puck
-    - am_echo
-    - am_eric
-    - am_liam
-    - am_onyx
-    - am_santa
-    - am_adam
-    - bf_emma
-    - bf_isabella
-    - bf_alice
-    - bf_lily
-    - bm_george
-    - bm_fable
-    - bm_lewis
-    - bm_daniel
     Args:
-        text: The text to generate audio from.
-        voice: The voice to use. Defaults to "af_heart".
-        speed: The speed of the audio. Defaults to 1.0.
     Returns:
         A tuple containing the audio and the tokens used to generate the audio.

 @spaces.GPU(duration=30)
 def generate(text: str, voice: str = "af_heart", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
+    """Synthesizes speech from English text using the Kokoro TTS model.
+    Note:
+        This model supports only English input texts.
+    Voice Selection:
+        - The `voice` parameter specifies the speaker's characteristics and should follow the naming convention:
+        `<language/accent><gender>_<voice_name>`
+        - `<language/accent>`:
+            - 'a' for American English
+            - 'b' for British English
+        - `<gender>`:
+            - 'f' for female
+            - 'm' for male
+        - Example: 'af_heart' indicates an American English female voice named Heart.
     Available voices:
+        - af_heart
+        - af_bella
+        - af_nicole
+        - af_aoede
+        - af_kore
+        - af_sarah
+        - af_nova
+        - af_sky
+        - af_alloy
+        - af_jessica
+        - af_river
+        - am_michael
+        - am_fenrir
+        - am_puck
+        - am_echo
+        - am_eric
+        - am_liam
+        - am_onyx
+        - am_santa
+        - am_adam
+        - bf_emma
+        - bf_isabella
+        - bf_alice
+        - bf_lily
+        - bm_george
+        - bm_fable
+        - bm_lewis
+        - bm_daniel
     Args:
+        text: Input text to be synthesized. Only English text is supported. Non-English input may result in errors or mispronunciations.
+        voice: Identifier for the voice to be used in synthesis. Defaults to "af_heart".
+        speed: Playback speed multiplier. A value of 1.0 means normal speed; values above or below adjust the speech rate accordingly. Defaults to 1.0.
     Returns:
         A tuple containing the audio and the tokens used to generate the audio.