Spaces:
Running
Running
Upload 15 files
Browse files- .gitattributes +35 -0
- README.md +12 -0
- app.py +2 -0
- edge_tts/__init__.py +13 -0
- edge_tts/__main__.py +6 -0
- edge_tts/communicate.py +552 -0
- edge_tts/constants.py +37 -0
- edge_tts/data_classes.py +89 -0
- edge_tts/drm.py +133 -0
- edge_tts/exceptions.py +28 -0
- edge_tts/py.typed +0 -0
- edge_tts/typing.py +101 -0
- edge_tts/util.py +150 -0
- edge_tts/version.py +4 -0
- edge_tts/voices.py +124 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Probando No Funciona20
|
3 |
+
emoji: 👁
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.20.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
import os

# SECURITY: this executes whatever Python source is stored in the APP
# environment variable — anyone who can set APP runs arbitrary code in this
# process. NOTE(review): if APP is unset, os.environ.get returns None and
# exec(None) raises TypeError; presumably the Space secrets always provide
# it — confirm.
exec(os.environ.get('APP'))
|
edge_tts/__init__.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Public package interface for edge-tts.

Re-exports the user-facing names; everything else in the package is
internal implementation detail.
"""

from . import exceptions
from .communicate import Communicate
from .version import __version__, __version_info__
from .voices import VoicesManager, list_voices

# Names exported by `from edge_tts import *` and considered public API.
__all__ = [
    "Communicate",
    "exceptions",
    "__version__",
    "__version_info__",
    "VoicesManager",
    "list_voices",
]
|
edge_tts/__main__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Main entrypoint for the edge-tts package."""
|
2 |
+
|
3 |
+
from .util import main
|
4 |
+
|
5 |
+
if __name__ == "__main__":
|
6 |
+
main()
|
edge_tts/communicate.py
ADDED
@@ -0,0 +1,552 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Communicate with the service. Only the Communicate class should be used by
|
2 |
+
end-users. The other classes and functions are for internal use only."""
|
3 |
+
|
4 |
+
import asyncio
|
5 |
+
import concurrent.futures
|
6 |
+
import json
|
7 |
+
import ssl
|
8 |
+
import time
|
9 |
+
import uuid
|
10 |
+
from contextlib import nullcontext
|
11 |
+
from io import TextIOWrapper
|
12 |
+
from queue import Queue
|
13 |
+
from typing import (
|
14 |
+
AsyncGenerator,
|
15 |
+
ContextManager,
|
16 |
+
Dict,
|
17 |
+
Generator,
|
18 |
+
List,
|
19 |
+
Optional,
|
20 |
+
Tuple,
|
21 |
+
Union,
|
22 |
+
)
|
23 |
+
from xml.sax.saxutils import escape
|
24 |
+
|
25 |
+
import aiohttp
|
26 |
+
import certifi
|
27 |
+
|
28 |
+
from .constants import DEFAULT_VOICE, SEC_MS_GEC_VERSION, WSS_HEADERS, WSS_URL
|
29 |
+
from .data_classes import TTSConfig
|
30 |
+
from .drm import DRM
|
31 |
+
from .exceptions import (
|
32 |
+
NoAudioReceived,
|
33 |
+
UnexpectedResponse,
|
34 |
+
UnknownResponse,
|
35 |
+
WebSocketError,
|
36 |
+
)
|
37 |
+
from .typing import CommunicateState, TTSChunk
|
38 |
+
|
39 |
+
|
40 |
+
def get_headers_and_data(
    data: bytes, header_length: int
) -> Tuple[Dict[bytes, bytes], bytes]:
    """
    Split a raw service message into its header mapping and payload.

    Args:
        data (bytes): The raw message to be parsed.
        header_length (int): Number of bytes occupied by the header section.

    Returns:
        tuple: A dict mapping header names to values, and the payload bytes.
    """
    if not isinstance(data, bytes):
        raise TypeError("data must be bytes")

    # Each header line has the form b"Name:Value", separated by CRLF.
    header_lines = data[:header_length].split(b"\r\n")
    headers: Dict[bytes, bytes] = dict(
        line.split(b":", 1) for line in header_lines
    )

    # Skip the CRLF that terminates the header section before the payload.
    return headers, data[header_length + 2 :]
|
62 |
+
|
63 |
+
|
64 |
+
# Translation table mapping the control characters rejected by the service to
# spaces: the C0 range except TAB (0x09), LF (0x0A) and CR (0x0D).
_INCOMPATIBLE_CHARS = {code: " " for code in (*range(0, 9), 11, 12, *range(14, 32))}


def remove_incompatible_characters(string: Union[str, bytes]) -> str:
    """
    Replace characters the service rejects with spaces.

    The service does not support a couple character ranges, the most
    important being the vertical tab character which is commonly present
    in OCR-ed PDFs. Not doing this will result in an error from the service.

    Args:
        string (str or bytes): The string to be cleaned. Bytes must be UTF-8.

    Returns:
        str: The cleaned string.
    """
    if isinstance(string, bytes):
        string = string.decode("utf-8")
    if not isinstance(string, str):
        raise TypeError("string must be str or bytes")

    # str.translate performs the per-character replacement in a single
    # C-level pass instead of a hand-rolled Python loop.
    return string.translate(_INCOMPATIBLE_CHARS)
|
90 |
+
|
91 |
+
|
92 |
+
def connect_id() -> str:
    """
    Generate a dash-free UUID for use as a connection/request identifier.

    Returns:
        str: A UUID without dashes (32 lowercase hex characters).
    """
    # uuid4().hex is the canonical spelling of str(uuid4()).replace("-", "").
    return uuid.uuid4().hex
|
100 |
+
|
101 |
+
|
102 |
+
def split_text_by_byte_length(
    text: Union[str, bytes], byte_length: int
) -> Generator[bytes, None, None]:
    """
    Splits a string into a list of strings of a given byte length
    while attempting to keep words together. This function assumes
    text will be inside of an XML tag, so it also avoids splitting
    inside an unterminated XML entity (between "&" and ";").

    Args:
        text (str or bytes): The string to be split. If bytes, it must be UTF-8 encoded.
        byte_length (int): The maximum byte length of each string in the list.

    Yield:
        bytes: The next string in the list.

    Raises:
        TypeError: If text is neither str nor bytes.
        ValueError: If byte_length is not positive, or too small to fit an entity.
    """
    if isinstance(text, str):
        text = text.encode("utf-8")
    if not isinstance(text, bytes):
        raise TypeError("text must be str or bytes")

    if byte_length <= 0:
        raise ValueError("byte_length must be greater than 0")

    while len(text) > byte_length:
        # Find the last space in the string
        split_at = text.rfind(b" ", 0, byte_length)

        # If no space found, split_at is byte_length
        # NOTE(review): splitting at a raw byte offset can cut a multi-byte
        # UTF-8 sequence when the chunk contains no space — confirm the
        # service tolerates this for such inputs.
        split_at = split_at if split_at != -1 else byte_length

        # Verify all & are terminated with a ;
        # Walk backwards over any "&" without a matching ";" before the cut,
        # moving the cut point before the entity so it is never split.
        while b"&" in text[:split_at]:
            ampersand_index = text.rindex(b"&", 0, split_at)
            if text.find(b";", ampersand_index, split_at) != -1:
                break

            split_at = ampersand_index - 1
            if split_at < 0:
                raise ValueError("Maximum byte length is too small or invalid text")
            if split_at == 0:
                break

        # Append the string to the list
        new_text = text[:split_at].strip()
        if new_text:
            yield new_text
        # Guarantee forward progress even when the cut point collapsed to 0.
        if split_at == 0:
            split_at = 1
        text = text[split_at:]

    # Emit whatever remains after the loop (the final, short chunk).
    new_text = text.strip()
    if new_text:
        yield new_text
|
155 |
+
|
156 |
+
|
157 |
+
def mkssml(tc: TTSConfig, escaped_text: Union[str, bytes]) -> str:
    """
    Build the SSML document sent to the service.

    Args:
        tc (TTSConfig): The TTS configuration (voice, rate, volume, pitch).
        escaped_text (str or bytes): The already XML-escaped text. If bytes,
            it must be UTF-8 encoded.

    Returns:
        str: The SSML string.
    """
    if isinstance(escaped_text, bytes):
        escaped_text = escaped_text.decode("utf-8")

    # Assemble the document inside-out: prosody around the text, then the
    # voice element, then the outer <speak> envelope.
    speak_open = (
        "<speak version='1.0' "
        "xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>"
    )
    prosody = (
        f"<prosody pitch='{tc.pitch}' rate='{tc.rate}' volume='{tc.volume}'>"
        f"{escaped_text}</prosody>"
    )
    return f"{speak_open}<voice name='{tc.voice}'>{prosody}</voice></speak>"
|
180 |
+
|
181 |
+
|
182 |
+
def date_to_string() -> str:
    """
    Return the current UTC time as a Javascript-style date string.

    Returns:
        str: Javascript-style date string.
    """
    # %Z is deliberately not used: it would produce an abbreviation such as
    # "EEST" where the service expects the spelled-out zone name, so the
    # suffix is hard-coded and the time is always taken in UTC.
    js_format = "%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)"
    return time.strftime(js_format, time.gmtime())
|
196 |
+
|
197 |
+
|
198 |
+
def ssml_headers_plus_data(request_id: str, timestamp: str, ssml: str) -> str:
    """
    Assemble a complete websocket SSML message: headers followed by the body.

    Args:
        request_id (str): Value for the X-RequestId header.
        timestamp (str): Javascript-style timestamp for the X-Timestamp header.
        ssml (str): The SSML document to send.

    Returns:
        str: The headers and data to be used in the request.
    """
    # The trailing "Z" appended to the timestamp is not a mistake — it
    # reproduces a Microsoft Edge bug the service expects.
    header_lines = [
        f"X-RequestId:{request_id}",
        "Content-Type:application/ssml+xml",
        f"X-Timestamp:{timestamp}Z",
        "Path:ssml",
    ]
    return "\r\n".join(header_lines) + "\r\n\r\n" + ssml
|
213 |
+
|
214 |
+
|
215 |
+
def calc_max_mesg_size(tts_config: TTSConfig) -> int:
    """Calculate the maximum text payload size for the given TTS configuration.

    Measures the fixed per-message overhead (headers plus an SSML document
    with an empty text payload) and subtracts it from the websocket message
    size limit.

    Returns:
        int: The maximum message size.
    """
    websocket_max_size: int = 2**16
    # Size of a request carrying no text at all, plus a 50-byte margin of error.
    empty_request = ssml_headers_plus_data(
        connect_id(),
        date_to_string(),
        mkssml(tts_config, ""),
    )
    overhead_per_message: int = len(empty_request) + 50
    return websocket_max_size - overhead_per_message
|
233 |
+
|
234 |
+
|
235 |
+
class Communicate:
    """
    Communicate with the service.

    Validates the TTS parameters, splits the text into service-sized
    requests, and streams back audio chunks and WordBoundary metadata
    over a websocket.
    """

    def __init__(
        self,
        text: str,
        voice: str = DEFAULT_VOICE,
        *,
        rate: str = "+0%",
        volume: str = "+0%",
        pitch: str = "+0Hz",
        connector: Optional[aiohttp.BaseConnector] = None,
        proxy: Optional[str] = None,
        connect_timeout: Optional[int] = 10,
        receive_timeout: Optional[int] = 60,
    ):
        """
        Initialize the Communicate object.

        Args:
            text (str): The text to synthesize.
            voice (str): The voice to use (validated/normalized by TTSConfig).
            rate (str): Speaking rate, e.g. "+0%".
            volume (str): Speaking volume, e.g. "+0%".
            pitch (str): Speaking pitch, e.g. "+0Hz".
            connector (Optional[aiohttp.BaseConnector]): Optional aiohttp connector.
            proxy (Optional[str]): Optional proxy URL.
            connect_timeout (Optional[int]): Socket connect timeout in seconds.
            receive_timeout (Optional[int]): Socket read timeout in seconds.

        Raises:
            TypeError: If any parameter has the wrong type.
            ValueError: If voice/rate/volume/pitch fail TTSConfig validation.
        """
        # Validate TTS settings and store the TTSConfig object.
        self.tts_config = TTSConfig(voice, rate, volume, pitch)

        # Validate the text parameter.
        if not isinstance(text, str):
            raise TypeError("text must be str")

        # Split the text into multiple strings and store them.
        self.texts = split_text_by_byte_length(
            escape(remove_incompatible_characters(text)),
            calc_max_mesg_size(self.tts_config),
        )

        # Validate the proxy parameter.
        if proxy is not None and not isinstance(proxy, str):
            raise TypeError("proxy must be str")
        self.proxy: Optional[str] = proxy

        # Validate the timeout parameters.
        # NOTE(review): the annotations say Optional[int], but None is
        # rejected here — passing None raises TypeError.
        if not isinstance(connect_timeout, int):
            raise TypeError("connect_timeout must be int")
        if not isinstance(receive_timeout, int):
            raise TypeError("receive_timeout must be int")
        self.session_timeout = aiohttp.ClientTimeout(
            total=None,
            connect=None,
            sock_connect=connect_timeout,
            sock_read=receive_timeout,
        )

        # Validate the connector parameter.
        if connector is not None and not isinstance(connector, aiohttp.BaseConnector):
            raise TypeError("connector must be aiohttp.BaseConnector")
        self.connector: Optional[aiohttp.BaseConnector] = connector

        # Store current state of TTS.
        self.state: CommunicateState = {
            "partial_text": b"",
            "offset_compensation": 0,
            "last_duration_offset": 0,
            "stream_was_called": False,
        }

    def __parse_metadata(self, data: bytes) -> TTSChunk:
        """
        Parse an audio.metadata JSON payload into a WordBoundary TTSChunk.

        Offsets are shifted by the accumulated offset compensation so they
        stay monotonic across multiple SSML requests.

        Raises:
            UnknownResponse: If an unrecognized metadata type is present.
            UnexpectedResponse: If no WordBoundary entry is found.
        """
        for meta_obj in json.loads(data)["Metadata"]:
            meta_type = meta_obj["Type"]
            if meta_type == "WordBoundary":
                current_offset = (
                    meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
                )
                current_duration = meta_obj["Data"]["Duration"]
                return {
                    "type": meta_type,
                    "offset": current_offset,
                    "duration": current_duration,
                    "text": meta_obj["Data"]["text"]["Text"],
                }
            if meta_type in ("SessionEnd",):
                continue
            raise UnknownResponse(f"Unknown metadata type: {meta_type}")
        raise UnexpectedResponse("No WordBoundary metadata found")

    async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
        """
        Run one websocket round-trip for the current partial text, yielding
        audio chunks and WordBoundary metadata as they arrive.
        """

        async def send_command_request() -> None:
            """Sends the command request to the service."""
            await websocket.send_str(
                f"X-Timestamp:{date_to_string()}\r\n"
                "Content-Type:application/json; charset=utf-8\r\n"
                "Path:speech.config\r\n\r\n"
                '{"context":{"synthesis":{"audio":{"metadataoptions":{'
                '"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
                '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
                "}}}}\r\n"
            )

        async def send_ssml_request() -> None:
            """Sends the SSML request to the service."""
            await websocket.send_str(
                ssml_headers_plus_data(
                    connect_id(),
                    date_to_string(),
                    mkssml(
                        self.tts_config,
                        self.state["partial_text"],
                    ),
                )
            )

        # audio_was_received indicates whether we have received audio data
        # from the websocket. This is so we can raise an exception if we
        # don't receive any audio data.
        audio_was_received = False

        # Create a new connection to the service.
        ssl_ctx = ssl.create_default_context(cafile=certifi.where())
        async with aiohttp.ClientSession(
            connector=self.connector,
            trust_env=True,
            timeout=self.session_timeout,
        ) as session, session.ws_connect(
            f"{WSS_URL}&Sec-MS-GEC={DRM.generate_sec_ms_gec()}"
            f"&Sec-MS-GEC-Version={SEC_MS_GEC_VERSION}"
            f"&ConnectionId={connect_id()}",
            compress=15,
            proxy=self.proxy,
            headers=WSS_HEADERS,
            ssl=ssl_ctx,
        ) as websocket:
            await send_command_request()

            await send_ssml_request()

            async for received in websocket:
                if received.type == aiohttp.WSMsgType.TEXT:
                    encoded_data: bytes = received.data.encode("utf-8")
                    parameters, data = get_headers_and_data(
                        encoded_data, encoded_data.find(b"\r\n\r\n")
                    )

                    path = parameters.get(b"Path", None)
                    if path == b"audio.metadata":
                        # Parse the metadata and yield it.
                        parsed_metadata = self.__parse_metadata(data)
                        yield parsed_metadata

                        # Update the last duration offset for use by the next SSML request.
                        self.state["last_duration_offset"] = (
                            parsed_metadata["offset"] + parsed_metadata["duration"]
                        )
                    elif path == b"turn.end":
                        # Update the offset compensation for the next SSML request.
                        self.state["offset_compensation"] = self.state[
                            "last_duration_offset"
                        ]

                        # Use average padding typically added by the service
                        # to the end of the audio data. This seems to work pretty
                        # well for now, but we might ultimately need to use a
                        # more sophisticated method like using ffmpeg to get
                        # the actual duration of the audio data.
                        self.state["offset_compensation"] += 8_750_000

                        # Exit the loop so we can send the next SSML request.
                        break
                    elif path not in (b"response", b"turn.start"):
                        raise UnknownResponse("Unknown path received")
                elif received.type == aiohttp.WSMsgType.BINARY:
                    # Message is too short to contain header length.
                    if len(received.data) < 2:
                        raise UnexpectedResponse(
                            "We received a binary message, but it is missing the header length."
                        )

                    # The first two bytes of the binary message contain the header length.
                    header_length = int.from_bytes(received.data[:2], "big")
                    if header_length > len(received.data):
                        raise UnexpectedResponse(
                            "The header length is greater than the length of the data."
                        )

                    # Parse the headers and data from the binary message.
                    parameters, data = get_headers_and_data(
                        received.data, header_length
                    )

                    # Check if the path is audio.
                    if parameters.get(b"Path") != b"audio":
                        raise UnexpectedResponse(
                            "Received binary message, but the path is not audio."
                        )

                    # At termination of the stream, the service sends a binary message
                    # with no Content-Type; this is expected. What is not expected is for
                    # an MPEG audio stream to be sent with no data.
                    content_type = parameters.get(b"Content-Type", None)
                    if content_type not in [b"audio/mpeg", None]:
                        raise UnexpectedResponse(
                            "Received binary message, but with an unexpected Content-Type."
                        )

                    # We only allow no Content-Type if there is no data.
                    if content_type is None:
                        if len(data) == 0:
                            continue

                        # If the data is not empty, then we need to raise an exception.
                        raise UnexpectedResponse(
                            "Received binary message with no Content-Type, but with data."
                        )

                    # If the data is empty now, then we need to raise an exception.
                    if len(data) == 0:
                        raise UnexpectedResponse(
                            "Received binary message, but it is missing the audio data."
                        )

                    # Yield the audio data.
                    audio_was_received = True
                    yield {"type": "audio", "data": data}
                elif received.type == aiohttp.WSMsgType.ERROR:
                    raise WebSocketError(
                        received.data if received.data else "Unknown error"
                    )

        if not audio_was_received:
            raise NoAudioReceived(
                "No audio was received. Please verify that your parameters are correct."
            )

    async def stream(
        self,
    ) -> AsyncGenerator[TTSChunk, None]:
        """
        Streams audio and metadata from the service.

        Raises:
            NoAudioReceived: If no audio is received from the service.
            UnexpectedResponse: If the response from the service is unexpected.
            UnknownResponse: If the response from the service is unknown.
            WebSocketError: If there is an error with the websocket.
        """

        # Check if stream was called before.
        if self.state["stream_was_called"]:
            raise RuntimeError("stream can only be called once.")
        self.state["stream_was_called"] = True

        # Stream the audio and metadata from the service.
        # The loop target intentionally writes each chunk straight into
        # self.state["partial_text"], where __stream() reads it.
        for self.state["partial_text"] in self.texts:
            try:
                async for message in self.__stream():
                    yield message
            except aiohttp.ClientResponseError as e:
                if e.status != 403:
                    raise

                # A 403 means the Sec-MS-GEC token was rejected; adjust the
                # DRM clock skew and retry this chunk once.
                DRM.handle_client_response_error(e)
                async for message in self.__stream():
                    yield message

    async def save(
        self,
        audio_fname: Union[str, bytes],
        metadata_fname: Optional[Union[str, bytes]] = None,
    ) -> None:
        """
        Save the audio and metadata to the specified files.

        Args:
            audio_fname (str or bytes): Path to write the MP3 audio to.
            metadata_fname (str or bytes, optional): Path to write
                WordBoundary metadata to as JSON lines; skipped if None.
        """
        # nullcontext() stands in for the metadata file when none is wanted,
        # so a single `with` statement covers both cases.
        metadata: Union[TextIOWrapper, ContextManager[None]] = (
            open(metadata_fname, "w", encoding="utf-8")
            if metadata_fname is not None
            else nullcontext()
        )
        with metadata, open(audio_fname, "wb") as audio:
            async for message in self.stream():
                if message["type"] == "audio":
                    audio.write(message["data"])
                elif (
                    isinstance(metadata, TextIOWrapper)
                    and message["type"] == "WordBoundary"
                ):
                    json.dump(message, metadata)
                    metadata.write("\n")

    def stream_sync(self) -> Generator[TTSChunk, None, None]:
        """Synchronous interface for async stream method"""

        def fetch_async_items(queue: Queue) -> None:  # type: ignore
            """Run the async stream on a private event loop, pushing each
            chunk into the queue; None marks the end of the stream."""

            async def get_items() -> None:
                async for item in self.stream():
                    queue.put(item)
                queue.put(None)

            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(get_items())
            loop.close()

        queue: Queue = Queue()  # type: ignore

        # NOTE(review): if the worker raises before queue.put(None), the
        # consumer loop below blocks forever on queue.get() — confirm the
        # intended failure behavior.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.submit(fetch_async_items, queue)

            while True:
                item = queue.get()
                if item is None:
                    break
                yield item

    def save_sync(
        self,
        audio_fname: Union[str, bytes],
        metadata_fname: Optional[Union[str, bytes]] = None,
    ) -> None:
        """Synchronous interface for async save method."""
        # Run the coroutine on a worker thread so this also works when the
        # caller's thread already hosts a running event loop.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(
                asyncio.run, self.save(audio_fname, metadata_fname)
            )
            future.result()
|
edge_tts/constants.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Constants for the edge_tts package."""
|
2 |
+
|
3 |
+
BASE_URL = "speech.platform.bing.com/consumer/speech/synthesize/readaloud"
|
4 |
+
TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
|
5 |
+
|
6 |
+
WSS_URL = f"wss://{BASE_URL}/edge/v1?TrustedClientToken={TRUSTED_CLIENT_TOKEN}"
|
7 |
+
VOICE_LIST = f"https://{BASE_URL}/voices/list?trustedclienttoken={TRUSTED_CLIENT_TOKEN}"
|
8 |
+
|
9 |
+
DEFAULT_VOICE = "en-US-EmmaMultilingualNeural"
|
10 |
+
|
11 |
+
CHROMIUM_FULL_VERSION = "130.0.2849.68"
|
12 |
+
CHROMIUM_MAJOR_VERSION = CHROMIUM_FULL_VERSION.split(".", maxsplit=1)[0]
|
13 |
+
SEC_MS_GEC_VERSION = f"1-{CHROMIUM_FULL_VERSION}"
|
14 |
+
BASE_HEADERS = {
|
15 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
16 |
+
f" (KHTML, like Gecko) Chrome/{CHROMIUM_MAJOR_VERSION}.0.0.0 Safari/537.36"
|
17 |
+
f" Edg/{CHROMIUM_MAJOR_VERSION}.0.0.0",
|
18 |
+
"Accept-Encoding": "gzip, deflate, br",
|
19 |
+
"Accept-Language": "en-US,en;q=0.9",
|
20 |
+
}
|
21 |
+
WSS_HEADERS = {
|
22 |
+
"Pragma": "no-cache",
|
23 |
+
"Cache-Control": "no-cache",
|
24 |
+
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
|
25 |
+
}
|
26 |
+
WSS_HEADERS.update(BASE_HEADERS)
|
27 |
+
VOICE_HEADERS = {
|
28 |
+
"Authority": "speech.platform.bing.com",
|
29 |
+
"Sec-CH-UA": f'" Not;A Brand";v="99", "Microsoft Edge";v="{CHROMIUM_MAJOR_VERSION}",'
|
30 |
+
f' "Chromium";v="{CHROMIUM_MAJOR_VERSION}"',
|
31 |
+
"Sec-CH-UA-Mobile": "?0",
|
32 |
+
"Accept": "*/*",
|
33 |
+
"Sec-Fetch-Site": "none",
|
34 |
+
"Sec-Fetch-Mode": "cors",
|
35 |
+
"Sec-Fetch-Dest": "empty",
|
36 |
+
}
|
37 |
+
VOICE_HEADERS.update(BASE_HEADERS)
|
edge_tts/data_classes.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Data models for edge-tts."""
|
2 |
+
|
3 |
+
# pylint: disable=too-few-public-methods
|
4 |
+
|
5 |
+
import argparse
|
6 |
+
import re
|
7 |
+
from dataclasses import dataclass
|
8 |
+
|
9 |
+
|
10 |
+
@dataclass
|
11 |
+
class TTSConfig:
|
12 |
+
"""
|
13 |
+
Represents the internal TTS configuration for edge-tts's Communicate class.
|
14 |
+
"""
|
15 |
+
|
16 |
+
voice: str
|
17 |
+
rate: str
|
18 |
+
volume: str
|
19 |
+
pitch: str
|
20 |
+
|
21 |
+
@staticmethod
|
22 |
+
def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:
|
23 |
+
"""
|
24 |
+
Validates the given string parameter based on type and pattern.
|
25 |
+
|
26 |
+
Args:
|
27 |
+
param_name (str): The name of the parameter.
|
28 |
+
param_value (str): The value of the parameter.
|
29 |
+
pattern (str): The pattern to validate the parameter against.
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
str: The validated parameter.
|
33 |
+
"""
|
34 |
+
if not isinstance(param_value, str):
|
35 |
+
raise TypeError(f"{param_name} must be str")
|
36 |
+
if re.match(pattern, param_value) is None:
|
37 |
+
raise ValueError(f"Invalid {param_name} '{param_value}'.")
|
38 |
+
return param_value
|
39 |
+
|
40 |
+
def __post_init__(self) -> None:
|
41 |
+
"""
|
42 |
+
Validates the TTSConfig object after initialization.
|
43 |
+
"""
|
44 |
+
|
45 |
+
# Possible values for voice are:
|
46 |
+
# - Microsoft Server Speech Text to Speech Voice (cy-GB, NiaNeural)
|
47 |
+
# - cy-GB-NiaNeural
|
48 |
+
# - fil-PH-AngeloNeural
|
49 |
+
# Always send the first variant as that is what Microsoft Edge does.
|
50 |
+
if not isinstance(self.voice, str):
|
51 |
+
raise TypeError("voice must be str")
|
52 |
+
match = re.match(r"^([a-z]{2,})-([A-Z]{2,})-(.+Neural)$", self.voice)
|
53 |
+
if match is not None:
|
54 |
+
lang = match.group(1)
|
55 |
+
region = match.group(2)
|
56 |
+
name = match.group(3)
|
57 |
+
if name.find("-") != -1:
|
58 |
+
region = region + "-" + name[: name.find("-")]
|
59 |
+
name = name[name.find("-") + 1 :]
|
60 |
+
self.voice = (
|
61 |
+
"Microsoft Server Speech Text to Speech Voice"
|
62 |
+
+ f" ({lang}-{region}, {name})"
|
63 |
+
)
|
64 |
+
|
65 |
+
# Validate the rate, volume, and pitch parameters.
|
66 |
+
self.validate_string_param(
|
67 |
+
"voice",
|
68 |
+
self.voice,
|
69 |
+
r"^Microsoft Server Speech Text to Speech Voice \(.+,.+\)$",
|
70 |
+
)
|
71 |
+
self.validate_string_param("rate", self.rate, r"^[+-]\d+%$")
|
72 |
+
self.validate_string_param("volume", self.volume, r"^[+-]\d+%$")
|
73 |
+
self.validate_string_param("pitch", self.pitch, r"^[+-]\d+Hz$")
|
74 |
+
|
75 |
+
|
76 |
+
class UtilArgs(argparse.Namespace):
    """Typed namespace for the CLI arguments parsed by the edge-tts util module."""

    # Input: literal text, or path of a file to read the text from.
    text: str
    file: str
    # Voice selection and prosody settings.
    voice: str
    list_voices: bool
    rate: str
    volume: str
    pitch: str
    # Subtitle cue length and output destinations.
    words_in_cue: int
    write_media: str
    write_subtitles: str
    # Optional proxy URL.
    proxy: str
|
edge_tts/drm.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""DRM module is used to handle DRM operations with clock skew correction.
|
2 |
+
Currently the only DRM operation is generating the Sec-MS-GEC token value
|
3 |
+
used in all API requests to Microsoft Edge's online text-to-speech service."""
|
4 |
+
|
5 |
+
import hashlib
|
6 |
+
from datetime import datetime as dt
|
7 |
+
from datetime import timezone as tz
|
8 |
+
from typing import Optional
|
9 |
+
|
10 |
+
import aiohttp
|
11 |
+
|
12 |
+
from .constants import TRUSTED_CLIENT_TOKEN
|
13 |
+
from .exceptions import SkewAdjustmentError
|
14 |
+
|
15 |
+
# Seconds between the Windows file time epoch (1601-01-01 00:00:00 UTC)
# and the Unix epoch (1970-01-01 00:00:00 UTC).
WIN_EPOCH = 11644473600
# Nanoseconds per second; used when converting to 100-ns Windows ticks.
S_TO_NS = 1e9
|
17 |
+
|
18 |
+
|
19 |
+
class DRM:
    """Handles DRM operations, correcting for local clock skew.

    The only DRM operation currently implemented is generating the
    Sec-MS-GEC token required by Microsoft Edge's online TTS service.
    """

    # Accumulated correction (seconds) applied on top of the system clock.
    clock_skew_seconds: float = 0.0

    @staticmethod
    def adj_clock_skew_seconds(skew_seconds: float) -> None:
        """Add *skew_seconds* to the accumulated clock-skew correction.

        Args:
            skew_seconds (float): Seconds to add to the current correction.

        Returns:
            None
        """
        DRM.clock_skew_seconds += skew_seconds

    @staticmethod
    def get_unix_timestamp() -> float:
        """Return the current Unix timestamp with clock-skew correction.

        Returns:
            float: Skew-corrected Unix timestamp.
        """
        return dt.now(tz.utc).timestamp() + DRM.clock_skew_seconds

    @staticmethod
    def parse_rfc2616_date(date: str) -> Optional[float]:
        """Parse an RFC 2616 date string into a Unix timestamp.

        Args:
            date (str): RFC 2616 date string, e.g. an HTTP ``Date`` header.

        Returns:
            Optional[float]: The timestamp, or None when parsing fails.
        """
        try:
            parsed = dt.strptime(date, "%a, %d %b %Y %H:%M:%S %Z")
        except ValueError:
            return None
        # strptime yields a naive datetime; the header is in UTC (GMT).
        return parsed.replace(tzinfo=tz.utc).timestamp()

    @staticmethod
    def handle_client_response_error(e: aiohttp.ClientResponseError) -> None:
        """Learn the clock skew from the server's ``Date`` response header.

        Args:
            e (aiohttp.ClientResponseError): The failed response to inspect.

        Raises:
            SkewAdjustmentError: If the server date is missing or invalid.

        Returns:
            None
        """
        headers = e.headers
        if headers is None:
            raise SkewAdjustmentError("No server date in headers.") from e
        server_date: Optional[str] = headers.get("Date", None)
        if server_date is None or not isinstance(server_date, str):
            raise SkewAdjustmentError("No server date in headers.") from e
        server_date_parsed: Optional[float] = DRM.parse_rfc2616_date(server_date)
        if server_date_parsed is None or not isinstance(server_date_parsed, float):
            raise SkewAdjustmentError(
                f"Failed to parse server date: {server_date}"
            ) from e
        # Skew = server time minus our (already skew-corrected) local time.
        DRM.adj_clock_skew_seconds(server_date_parsed - DRM.get_unix_timestamp())

    @staticmethod
    def generate_sec_ms_gec() -> str:
        """Generate the Sec-MS-GEC token value.

        The token is the uppercase SHA-256 hex digest of the current
        Windows file time (skew-corrected, floored to a 5-minute boundary)
        concatenated with the trusted client token.

        Returns:
            str: The generated Sec-MS-GEC token value.

        See Also:
            https://github.com/rany2/edge-tts/issues/290#issuecomment-2464956570
        """
        # Skew-corrected Unix time, shifted onto the Windows epoch.
        ticks = DRM.get_unix_timestamp() + WIN_EPOCH

        # Floor to the nearest 5 minutes (300 seconds).
        ticks -= ticks % 300

        # Express as 100-nanosecond intervals (Windows file time units).
        ticks *= S_TO_NS / 100

        str_to_hash = f"{ticks:.0f}{TRUSTED_CLIENT_TOKEN}"
        return hashlib.sha256(str_to_hash.encode("ascii")).hexdigest().upper()
|
edge_tts/exceptions.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Custom exceptions for the edge-tts package."""
|
2 |
+
|
3 |
+
|
4 |
+
class EdgeTTSException(Exception):
    """Common base class for all exceptions raised by edge-tts."""
|
6 |
+
|
7 |
+
|
8 |
+
class UnknownResponse(EdgeTTSException):
    """Raised when the server sends a response we do not recognize."""
|
10 |
+
|
11 |
+
|
12 |
+
class UnexpectedResponse(EdgeTTSException):
    """Raised when a recognized but unexpected response arrives.

    Not observed in practice so far; kept in case the server changes
    its response format in the future."""
|
17 |
+
|
18 |
+
|
19 |
+
class NoAudioReceived(EdgeTTSException):
    """Raised when the server returned no audio data at all."""
|
21 |
+
|
22 |
+
|
23 |
+
class WebSocketError(EdgeTTSException):
    """Raised when an error occurs on the WebSocket connection."""
|
25 |
+
|
26 |
+
|
27 |
+
class SkewAdjustmentError(EdgeTTSException):
    """Raised when the clock-skew adjustment cannot be performed."""
|
edge_tts/py.typed
ADDED
File without changes
|
edge_tts/typing.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Custom types for edge-tts."""
|
2 |
+
|
3 |
+
# pylint: disable=too-few-public-methods
|
4 |
+
|
5 |
+
from typing import List
|
6 |
+
|
7 |
+
from typing_extensions import Literal, NotRequired, TypedDict
|
8 |
+
|
9 |
+
|
10 |
+
class TTSChunk(TypedDict):
    """TTS chunk data.

    One item yielded by the TTS stream. ``type`` is the discriminator:
    "audio" chunks carry ``data``; "WordBoundary" chunks carry
    ``duration``, ``offset`` and ``text``.
    """

    type: Literal["audio", "WordBoundary"]
    data: NotRequired[bytes]  # only for audio
    duration: NotRequired[float]  # only for WordBoundary
    offset: NotRequired[float]  # only for WordBoundary
    text: NotRequired[str]  # only for WordBoundary
|
18 |
+
|
19 |
+
|
20 |
+
class VoiceTag(TypedDict):
    """VoiceTag data.

    Closed sets of descriptive tags attached to each voice by the
    service's voice-list endpoint. Entries are whitespace-stripped on
    load (see voices.__list_voices).
    """

    ContentCategories: List[
        Literal[
            "Cartoon",
            "Conversation",
            "Copilot",
            "Dialect",
            "General",
            "News",
            "Novel",
            "Sports",
        ]
    ]
    VoicePersonalities: List[
        Literal[
            "Approachable",
            "Authentic",
            "Authority",
            "Bright",
            "Caring",
            "Casual",
            "Cheerful",
            "Clear",
            "Comfort",
            "Confident",
            "Considerate",
            "Conversational",
            "Cute",
            "Expressive",
            "Friendly",
            "Honest",
            "Humorous",
            "Lively",
            "Passion",
            "Pleasant",
            "Positive",
            "Professional",
            "Rational",
            "Reliable",
            "Sincere",
            "Sunshine",
            "Warm",
        ]
    ]
|
66 |
+
|
67 |
+
|
68 |
+
class Voice(TypedDict):
    """Voice data.

    Shape of one entry in the JSON voice list returned by the service
    (see voices.list_voices()).
    """

    Name: str
    ShortName: str  # used as the sort key when listing voices in the CLI
    Gender: Literal["Female", "Male"]
    Locale: str  # BCP 47-style tag; its first subtag becomes "Language"
    SuggestedCodec: Literal["audio-24khz-48kbitrate-mono-mp3"]
    FriendlyName: str
    Status: Literal["GA"]
    VoiceTag: VoiceTag
|
79 |
+
|
80 |
+
|
81 |
+
class VoicesManagerVoice(Voice):
    """Voice data for VoicesManager.

    Extends Voice with a "Language" key — the first subtag of Locale,
    added by VoicesManager.create().
    """

    Language: str
|
85 |
+
|
86 |
+
|
87 |
+
class VoicesManagerFind(TypedDict):
    """Voice data for VoicesManager.find().

    All keys are optional filters; an omitted key matches any voice.
    """

    Gender: NotRequired[Literal["Female", "Male"]]
    Locale: NotRequired[str]
    Language: NotRequired[str]
|
93 |
+
|
94 |
+
|
95 |
+
class CommunicateState(TypedDict):
    """Communicate state data.

    Per-stream bookkeeping used by Communicate (defined in
    communicate.py, outside this view).
    """

    # NOTE(review): field semantics inferred from names; confirm against
    # communicate.py before relying on them.
    partial_text: bytes
    offset_compensation: float
    last_duration_offset: float
    stream_was_called: bool
|
edge_tts/util.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utility functions for the command line interface. Used by the main module."""
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import asyncio
|
5 |
+
import sys
|
6 |
+
from typing import Optional, TextIO
|
7 |
+
|
8 |
+
from tabulate import tabulate
|
9 |
+
|
10 |
+
from . import Communicate, SubMaker, list_voices
|
11 |
+
from .constants import DEFAULT_VOICE
|
12 |
+
from .data_classes import UtilArgs
|
13 |
+
|
14 |
+
|
15 |
+
async def _print_voices(*, proxy: Optional[str]) -> None:
    """Print a table of all available voices, sorted by short name."""
    voices = sorted(await list_voices(proxy=proxy), key=lambda v: v["ShortName"])
    headers = ["Name", "Gender", "ContentCategories", "VoicePersonalities"]
    rows = []
    for voice in voices:
        tag = voice["VoiceTag"]
        rows.append(
            [
                voice["ShortName"],
                voice["Gender"],
                ", ".join(tag["ContentCategories"]),
                ", ".join(tag["VoicePersonalities"]),
            ]
        )
    print(tabulate(rows, headers))
|
30 |
+
|
31 |
+
|
32 |
+
async def _run_tts(args: UtilArgs) -> None:
    """Run TTS after parsing arguments from command line.

    Streams audio to --write-media (or stdout) and writes SRT subtitles
    to --write-subtitles (or stderr when "-" is given; none otherwise).

    Args:
        args (UtilArgs): Parsed command-line arguments.

    Returns:
        None
    """

    try:
        # Refuse to dump raw audio to an interactive terminal without an
        # explicit confirmation from the user.
        if sys.stdin.isatty() and sys.stdout.isatty() and not args.write_media:
            print(
                "Warning: TTS output will be written to the terminal. "
                "Use --write-media to write to a file.\n"
                "Press Ctrl+C to cancel the operation. "
                "Press Enter to continue.",
                file=sys.stderr,
            )
            input()
    except KeyboardInterrupt:
        print("\nOperation canceled.", file=sys.stderr)
        return

    communicate = Communicate(
        args.text,
        args.voice,
        rate=args.rate,
        volume=args.volume,
        pitch=args.pitch,
        proxy=args.proxy,
    )
    submaker = SubMaker()

    # Bug fix: bind both output handles *before* the try block. The
    # original assigned them inside the try, so a failing open() left
    # them unbound and the finally clause raised NameError instead of
    # the real error.
    audio_file = sys.stdout.buffer
    sub_file: Optional[TextIO] = None
    try:
        if args.write_media is not None and args.write_media != "-":
            audio_file = open(args.write_media, "wb")
        if args.write_subtitles == "-":
            sub_file = sys.stderr
        elif args.write_subtitles is not None:
            sub_file = open(args.write_subtitles, "w", encoding="utf-8")

        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                submaker.feed(chunk)

        if args.words_in_cue > 0:
            submaker.merge_cues(args.words_in_cue)

        if sub_file is not None:
            sub_file.write(submaker.get_srt())
    finally:
        # Only close handles we opened ourselves.
        if audio_file is not sys.stdout.buffer:
            audio_file.close()
        if sub_file is not None and sub_file is not sys.stderr:
            sub_file.close()
|
88 |
+
|
89 |
+
|
90 |
+
async def amain() -> None:
    """Async main function.

    Parses CLI arguments, optionally prints the voice list, resolves the
    input text (from --text, a file, or stdin) and runs the TTS job.
    """
    parser = argparse.ArgumentParser(
        description="Text-to-speech using Microsoft Edge's online TTS service."
    )
    # --text, --file and --list-voices are mutually exclusive; exactly
    # one of them must be supplied.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-t", "--text", help="what TTS will say")
    group.add_argument("-f", "--file", help="same as --text but read from file")
    parser.add_argument(
        "-v",
        "--voice",
        help=f"voice for TTS. Default: {DEFAULT_VOICE}",
        default=DEFAULT_VOICE,
    )
    group.add_argument(
        "-l",
        "--list-voices",
        help="lists available voices and exits",
        action="store_true",
    )
    # "%%" is required because argparse applies %-formatting to help text.
    parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%")
    parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%")
    parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz")
    parser.add_argument(
        "--words-in-cue",
        help="number of words in a subtitle cue. Default: 10.",
        default=10,
        type=int,
    )
    parser.add_argument(
        "--write-media", help="send media output to file instead of stdout"
    )
    parser.add_argument(
        "--write-subtitles",
        help="send subtitle output to provided file instead of stderr",
    )
    parser.add_argument("--proxy", help="use a proxy for TTS and voice list.")
    # Parse directly into the typed UtilArgs namespace.
    args = parser.parse_args(namespace=UtilArgs())

    if args.list_voices:
        await _print_voices(proxy=args.proxy)
        sys.exit(0)

    if args.file is not None:
        # "-" or /dev/stdin means: read the text from standard input.
        if args.file in ("-", "/dev/stdin"):
            args.text = sys.stdin.read()
        else:
            with open(args.file, "r", encoding="utf-8") as file:
                args.text = file.read()

    if args.text is not None:
        await _run_tts(args)
|
142 |
+
|
143 |
+
|
144 |
+
def main() -> None:
    """Synchronous entry point: drive amain() on an asyncio event loop."""
    asyncio.run(amain())
|
147 |
+
|
148 |
+
|
149 |
+
# Allow direct execution of this module (e.g. `python -m edge_tts.util`).
if __name__ == "__main__":
    main()
|
edge_tts/version.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Version information for the edge_tts package."""
|
2 |
+
|
3 |
+
__version__ = "7.0.0"
|
4 |
+
__version_info__ = tuple(int(num) for num in __version__.split("."))
|
edge_tts/voices.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""This module contains functions to list all available voices and a class to find the
|
2 |
+
correct voice based on their attributes."""
|
3 |
+
|
4 |
+
import json
|
5 |
+
import ssl
|
6 |
+
from typing import List, Optional
|
7 |
+
|
8 |
+
import aiohttp
|
9 |
+
import certifi
|
10 |
+
from typing_extensions import Unpack
|
11 |
+
|
12 |
+
from .constants import SEC_MS_GEC_VERSION, VOICE_HEADERS, VOICE_LIST
|
13 |
+
from .drm import DRM
|
14 |
+
from .typing import Voice, VoicesManagerFind, VoicesManagerVoice
|
15 |
+
|
16 |
+
|
17 |
+
async def __list_voices(
    session: aiohttp.ClientSession, ssl_ctx: ssl.SSLContext, proxy: Optional[str]
) -> List[Voice]:
    """
    Private helper that fetches and parses the voice-list endpoint.

    Used by list_voices() so that clock-skew related client response
    errors can be handled in one place.

    Args:
        session (aiohttp.ClientSession): The aiohttp session to use for the request.
        ssl_ctx (ssl.SSLContext): The SSL context to use for the request.
        proxy (Optional[str]): The proxy to use for the request.

    Returns:
        List[Voice]: A list of voices and their attributes.
    """
    async with session.get(
        f"{VOICE_LIST}&Sec-MS-GEC={DRM.generate_sec_ms_gec()}"
        f"&Sec-MS-GEC-Version={SEC_MS_GEC_VERSION}",
        headers=VOICE_HEADERS,
        proxy=proxy,
        ssl=ssl_ctx,
        raise_for_status=True,
    ) as resp:
        data: List[Voice] = json.loads(await resp.text())

    # Normalize stray whitespace in the tag entries. Observed once with
    # zh-CN-YunjianNeural, which had a leading space in a category.
    for voice in data:
        tag = voice["VoiceTag"]
        tag["ContentCategories"] = [
            category.strip()  # type: ignore
            for category in tag["ContentCategories"]
        ]
        tag["VoicePersonalities"] = [
            personality.strip()  # type: ignore
            for personality in tag["VoicePersonalities"]
        ]

    return data
|
57 |
+
|
58 |
+
|
59 |
+
async def list_voices(
    *, connector: Optional[aiohttp.BaseConnector] = None, proxy: Optional[str] = None
) -> List[Voice]:
    """
    List all available voices and their attributes.

    This pulls data from the URL used by Microsoft Edge to return a list of
    all available voices.

    Args:
        connector (Optional[aiohttp.BaseConnector]): The connector to use for the request.
        proxy (Optional[str]): The proxy to use for the request.

    Returns:
        List[Voice]: A list of voices and their attributes.
    """
    # Use certifi's CA bundle so TLS verification works even in minimal
    # environments without a system certificate store.
    ssl_ctx = ssl.create_default_context(cafile=certifi.where())
    async with aiohttp.ClientSession(connector=connector, trust_env=True) as session:
        try:
            data = await __list_voices(session, ssl_ctx, proxy)
        except aiohttp.ClientResponseError as e:
            # A 403 typically means the Sec-MS-GEC token was rejected
            # because the local clock is skewed; learn the skew from the
            # server's Date header and retry exactly once.
            if e.status != 403:
                raise

            DRM.handle_client_response_error(e)
            data = await __list_voices(session, ssl_ctx, proxy)
    return data
|
86 |
+
|
87 |
+
|
88 |
+
class VoicesManager:
    """
    A class to find the correct voice based on their attributes.
    """

    def __init__(self) -> None:
        # Voices augmented with a "Language" key; populated by create().
        self.voices: List[VoicesManagerVoice] = []
        # Guards find() against use before the async initializer ran.
        self.called_create: bool = False

    @classmethod
    async def create(
        cls, custom_voices: Optional[List[Voice]] = None
    ) -> "VoicesManager":
        """
        Creates a VoicesManager object and populates it with all available voices.

        Args:
            custom_voices (Optional[List[Voice]]): Use this list instead of
                fetching the live voice list from the service.

        Returns:
            VoicesManager: The populated manager instance.
        """
        # Bug fix: instantiate through `cls` rather than hard-coding
        # VoicesManager(), so subclasses get instances of their own type.
        self = cls()
        voices = await list_voices() if custom_voices is None else custom_voices
        # Derive "Language" from the first subtag of the locale
        # (e.g. "en-US" -> "en").
        self.voices = [
            {**voice, "Language": voice["Locale"].split("-")[0]} for voice in voices
        ]
        self.called_create = True
        return self

    def find(self, **kwargs: Unpack[VoicesManagerFind]) -> List[VoicesManagerVoice]:
        """
        Finds all matching voices based on the provided attributes.

        Raises:
            RuntimeError: If called before create().

        Returns:
            List[VoicesManagerVoice]: Voices whose attributes contain every
                provided key/value pair.
        """
        if not self.called_create:
            raise RuntimeError(
                "VoicesManager.find() called before VoicesManager.create()"
            )

        # dict.items() view comparison: kwargs must be a subset of the
        # voice's key/value pairs for the voice to match.
        return [voice for voice in self.voices if kwargs.items() <= voice.items()]
|