Spaces:
Running
Running
Upload 15 files
Browse files- .gitattributes +35 -0
- README.md +12 -0
- app.py +2 -0
- edge_tts/__init__.py +13 -0
- edge_tts/__main__.py +6 -0
- edge_tts/communicate.py +552 -0
- edge_tts/constants.py +37 -0
- edge_tts/data_classes.py +89 -0
- edge_tts/drm.py +133 -0
- edge_tts/exceptions.py +28 -0
- edge_tts/py.typed +0 -0
- edge_tts/typing.py +101 -0
- edge_tts/util.py +150 -0
- edge_tts/version.py +4 -0
- edge_tts/voices.py +124 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Probando No Funciona20
|
3 |
+
emoji: 👁
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.20.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
import os

# SECURITY: this executes whatever Python source is stored in the APP
# environment variable — anyone who can set APP runs arbitrary code in this
# process. NOTE(review): if APP is unset, os.environ.get returns None and
# exec(None) raises TypeError; presumably the Space secrets always provide
# it — confirm.
exec(os.environ.get('APP'))
|
edge_tts/__init__.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Public package interface for edge-tts.

Re-exports the user-facing names; everything else in the package is
internal implementation detail.
"""

from . import exceptions
from .communicate import Communicate
from .version import __version__, __version_info__
from .voices import VoicesManager, list_voices

# Names exported by `from edge_tts import *` and considered public API.
__all__ = [
    "Communicate",
    "exceptions",
    "__version__",
    "__version_info__",
    "VoicesManager",
    "list_voices",
]
|
edge_tts/__main__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Main entrypoint for the edge-tts package."""
|
2 |
+
|
3 |
+
from .util import main
|
4 |
+
|
5 |
+
if __name__ == "__main__":
|
6 |
+
main()
|
edge_tts/communicate.py
ADDED
@@ -0,0 +1,552 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Communicate with the service. Only the Communicate class should be used by
|
2 |
+
end-users. The other classes and functions are for internal use only."""
|
3 |
+
|
4 |
+
import asyncio
|
5 |
+
import concurrent.futures
|
6 |
+
import json
|
7 |
+
import ssl
|
8 |
+
import time
|
9 |
+
import uuid
|
10 |
+
from contextlib import nullcontext
|
11 |
+
from io import TextIOWrapper
|
12 |
+
from queue import Queue
|
13 |
+
from typing import (
|
14 |
+
AsyncGenerator,
|
15 |
+
ContextManager,
|
16 |
+
Dict,
|
17 |
+
Generator,
|
18 |
+
List,
|
19 |
+
Optional,
|
20 |
+
Tuple,
|
21 |
+
Union,
|
22 |
+
)
|
23 |
+
from xml.sax.saxutils import escape
|
24 |
+
|
25 |
+
import aiohttp
|
26 |
+
import certifi
|
27 |
+
|
28 |
+
from .constants import DEFAULT_VOICE, SEC_MS_GEC_VERSION, WSS_HEADERS, WSS_URL
|
29 |
+
from .data_classes import TTSConfig
|
30 |
+
from .drm import DRM
|
31 |
+
from .exceptions import (
|
32 |
+
NoAudioReceived,
|
33 |
+
UnexpectedResponse,
|
34 |
+
UnknownResponse,
|
35 |
+
WebSocketError,
|
36 |
+
)
|
37 |
+
from .typing import CommunicateState, TTSChunk
|
38 |
+
|
39 |
+
|
40 |
+
def get_headers_and_data(
    data: bytes, header_length: int
) -> Tuple[Dict[bytes, bytes], bytes]:
    """
    Split a raw service message into its header mapping and payload.

    Args:
        data (bytes): The raw message to be parsed.
        header_length (int): Number of bytes occupied by the header section.

    Returns:
        tuple: A dict mapping header names to values, and the payload bytes.
    """
    if not isinstance(data, bytes):
        raise TypeError("data must be bytes")

    # Each header line has the form b"Name:Value", separated by CRLF.
    header_lines = data[:header_length].split(b"\r\n")
    headers: Dict[bytes, bytes] = dict(
        line.split(b":", 1) for line in header_lines
    )

    # Skip the CRLF that terminates the header section before the payload.
    return headers, data[header_length + 2 :]
|
62 |
+
|
63 |
+
|
64 |
+
# Translation table mapping the control characters rejected by the service to
# spaces: the C0 range except TAB (0x09), LF (0x0A) and CR (0x0D).
_INCOMPATIBLE_CHARS = {code: " " for code in (*range(0, 9), 11, 12, *range(14, 32))}


def remove_incompatible_characters(string: Union[str, bytes]) -> str:
    """
    Replace characters the service rejects with spaces.

    The service does not support a couple character ranges, the most
    important being the vertical tab character which is commonly present
    in OCR-ed PDFs. Not doing this will result in an error from the service.

    Args:
        string (str or bytes): The string to be cleaned. Bytes must be UTF-8.

    Returns:
        str: The cleaned string.
    """
    if isinstance(string, bytes):
        string = string.decode("utf-8")
    if not isinstance(string, str):
        raise TypeError("string must be str or bytes")

    # str.translate performs the per-character replacement in a single
    # C-level pass instead of a hand-rolled Python loop.
    return string.translate(_INCOMPATIBLE_CHARS)
|
90 |
+
|
91 |
+
|
92 |
+
def connect_id() -> str:
    """
    Generate a dash-free UUID for use as a connection/request identifier.

    Returns:
        str: A UUID without dashes (32 lowercase hex characters).
    """
    # uuid4().hex is the canonical spelling of str(uuid4()).replace("-", "").
    return uuid.uuid4().hex
|
100 |
+
|
101 |
+
|
102 |
+
def split_text_by_byte_length(
    text: Union[str, bytes], byte_length: int
) -> Generator[bytes, None, None]:
    """
    Splits a string into a list of strings of a given byte length
    while attempting to keep words together. This function assumes
    text will be inside of an XML tag, so it also avoids splitting
    inside an unterminated XML entity (between "&" and ";").

    Args:
        text (str or bytes): The string to be split. If bytes, it must be UTF-8 encoded.
        byte_length (int): The maximum byte length of each string in the list.

    Yield:
        bytes: The next string in the list.

    Raises:
        TypeError: If text is neither str nor bytes.
        ValueError: If byte_length is not positive, or too small to fit an entity.
    """
    if isinstance(text, str):
        text = text.encode("utf-8")
    if not isinstance(text, bytes):
        raise TypeError("text must be str or bytes")

    if byte_length <= 0:
        raise ValueError("byte_length must be greater than 0")

    while len(text) > byte_length:
        # Find the last space in the string
        split_at = text.rfind(b" ", 0, byte_length)

        # If no space found, split_at is byte_length
        # NOTE(review): splitting at a raw byte offset can cut a multi-byte
        # UTF-8 sequence when the chunk contains no space — confirm the
        # service tolerates this for such inputs.
        split_at = split_at if split_at != -1 else byte_length

        # Verify all & are terminated with a ;
        # Walk backwards over any "&" without a matching ";" before the cut,
        # moving the cut point before the entity so it is never split.
        while b"&" in text[:split_at]:
            ampersand_index = text.rindex(b"&", 0, split_at)
            if text.find(b";", ampersand_index, split_at) != -1:
                break

            split_at = ampersand_index - 1
            if split_at < 0:
                raise ValueError("Maximum byte length is too small or invalid text")
            if split_at == 0:
                break

        # Append the string to the list
        new_text = text[:split_at].strip()
        if new_text:
            yield new_text
        # Guarantee forward progress even when the cut point collapsed to 0.
        if split_at == 0:
            split_at = 1
        text = text[split_at:]

    # Emit whatever remains after the loop (the final, short chunk).
    new_text = text.strip()
    if new_text:
        yield new_text
|
155 |
+
|
156 |
+
|
157 |
+
def mkssml(tc: TTSConfig, escaped_text: Union[str, bytes]) -> str:
    """
    Build the SSML document sent to the service.

    Args:
        tc (TTSConfig): The TTS configuration (voice, rate, volume, pitch).
        escaped_text (str or bytes): The already XML-escaped text. If bytes,
            it must be UTF-8 encoded.

    Returns:
        str: The SSML string.
    """
    if isinstance(escaped_text, bytes):
        escaped_text = escaped_text.decode("utf-8")

    # Assemble the document inside-out: prosody around the text, then the
    # voice element, then the outer <speak> envelope.
    speak_open = (
        "<speak version='1.0' "
        "xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>"
    )
    prosody = (
        f"<prosody pitch='{tc.pitch}' rate='{tc.rate}' volume='{tc.volume}'>"
        f"{escaped_text}</prosody>"
    )
    return f"{speak_open}<voice name='{tc.voice}'>{prosody}</voice></speak>"
|
180 |
+
|
181 |
+
|
182 |
+
def date_to_string() -> str:
    """
    Return the current UTC time as a Javascript-style date string.

    Returns:
        str: Javascript-style date string.
    """
    # %Z is deliberately not used: it would produce an abbreviation such as
    # "EEST" where the service expects the spelled-out zone name, so the
    # suffix is hard-coded and the time is always taken in UTC.
    js_format = "%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)"
    return time.strftime(js_format, time.gmtime())
|
196 |
+
|
197 |
+
|
198 |
+
def ssml_headers_plus_data(request_id: str, timestamp: str, ssml: str) -> str:
    """
    Assemble a complete websocket SSML message: headers followed by the body.

    Args:
        request_id (str): Value for the X-RequestId header.
        timestamp (str): Javascript-style timestamp for the X-Timestamp header.
        ssml (str): The SSML document to send.

    Returns:
        str: The headers and data to be used in the request.
    """
    # The trailing "Z" appended to the timestamp is not a mistake — it
    # reproduces a Microsoft Edge bug the service expects.
    header_lines = [
        f"X-RequestId:{request_id}",
        "Content-Type:application/ssml+xml",
        f"X-Timestamp:{timestamp}Z",
        "Path:ssml",
    ]
    return "\r\n".join(header_lines) + "\r\n\r\n" + ssml
|
213 |
+
|
214 |
+
|
215 |
+
def calc_max_mesg_size(tts_config: TTSConfig) -> int:
    """Calculate the maximum text payload size for the given TTS configuration.

    Measures the fixed per-message overhead (headers plus an SSML document
    with an empty text payload) and subtracts it from the websocket message
    size limit.

    Returns:
        int: The maximum message size.
    """
    websocket_max_size: int = 2**16
    # Size of a request carrying no text at all, plus a 50-byte margin of error.
    empty_request = ssml_headers_plus_data(
        connect_id(),
        date_to_string(),
        mkssml(tts_config, ""),
    )
    overhead_per_message: int = len(empty_request) + 50
    return websocket_max_size - overhead_per_message
|
233 |
+
|
234 |
+
|
235 |
+
class Communicate:
    """
    Communicate with the service.

    Validates the TTS parameters, splits the text into service-sized
    requests, and streams back audio chunks and WordBoundary metadata
    over a websocket.
    """

    def __init__(
        self,
        text: str,
        voice: str = DEFAULT_VOICE,
        *,
        rate: str = "+0%",
        volume: str = "+0%",
        pitch: str = "+0Hz",
        connector: Optional[aiohttp.BaseConnector] = None,
        proxy: Optional[str] = None,
        connect_timeout: Optional[int] = 10,
        receive_timeout: Optional[int] = 60,
    ):
        """
        Initialize the Communicate object.

        Args:
            text (str): The text to synthesize.
            voice (str): The voice to use (validated/normalized by TTSConfig).
            rate (str): Speaking rate, e.g. "+0%".
            volume (str): Speaking volume, e.g. "+0%".
            pitch (str): Speaking pitch, e.g. "+0Hz".
            connector (Optional[aiohttp.BaseConnector]): Optional aiohttp connector.
            proxy (Optional[str]): Optional proxy URL.
            connect_timeout (Optional[int]): Socket connect timeout in seconds.
            receive_timeout (Optional[int]): Socket read timeout in seconds.

        Raises:
            TypeError: If any parameter has the wrong type.
            ValueError: If voice/rate/volume/pitch fail TTSConfig validation.
        """
        # Validate TTS settings and store the TTSConfig object.
        self.tts_config = TTSConfig(voice, rate, volume, pitch)

        # Validate the text parameter.
        if not isinstance(text, str):
            raise TypeError("text must be str")

        # Split the text into multiple strings and store them.
        self.texts = split_text_by_byte_length(
            escape(remove_incompatible_characters(text)),
            calc_max_mesg_size(self.tts_config),
        )

        # Validate the proxy parameter.
        if proxy is not None and not isinstance(proxy, str):
            raise TypeError("proxy must be str")
        self.proxy: Optional[str] = proxy

        # Validate the timeout parameters.
        # NOTE(review): the annotations say Optional[int], but None is
        # rejected here — passing None raises TypeError.
        if not isinstance(connect_timeout, int):
            raise TypeError("connect_timeout must be int")
        if not isinstance(receive_timeout, int):
            raise TypeError("receive_timeout must be int")
        self.session_timeout = aiohttp.ClientTimeout(
            total=None,
            connect=None,
            sock_connect=connect_timeout,
            sock_read=receive_timeout,
        )

        # Validate the connector parameter.
        if connector is not None and not isinstance(connector, aiohttp.BaseConnector):
            raise TypeError("connector must be aiohttp.BaseConnector")
        self.connector: Optional[aiohttp.BaseConnector] = connector

        # Store current state of TTS.
        self.state: CommunicateState = {
            "partial_text": b"",
            "offset_compensation": 0,
            "last_duration_offset": 0,
            "stream_was_called": False,
        }

    def __parse_metadata(self, data: bytes) -> TTSChunk:
        """
        Parse an audio.metadata JSON payload into a WordBoundary TTSChunk.

        Offsets are shifted by the accumulated offset compensation so they
        stay monotonic across multiple SSML requests.

        Raises:
            UnknownResponse: If an unrecognized metadata type is present.
            UnexpectedResponse: If no WordBoundary entry is found.
        """
        for meta_obj in json.loads(data)["Metadata"]:
            meta_type = meta_obj["Type"]
            if meta_type == "WordBoundary":
                current_offset = (
                    meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
                )
                current_duration = meta_obj["Data"]["Duration"]
                return {
                    "type": meta_type,
                    "offset": current_offset,
                    "duration": current_duration,
                    "text": meta_obj["Data"]["text"]["Text"],
                }
            if meta_type in ("SessionEnd",):
                continue
            raise UnknownResponse(f"Unknown metadata type: {meta_type}")
        raise UnexpectedResponse("No WordBoundary metadata found")

    async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
        """
        Run one websocket round-trip for the current partial text, yielding
        audio chunks and WordBoundary metadata as they arrive.
        """

        async def send_command_request() -> None:
            """Sends the command request to the service."""
            await websocket.send_str(
                f"X-Timestamp:{date_to_string()}\r\n"
                "Content-Type:application/json; charset=utf-8\r\n"
                "Path:speech.config\r\n\r\n"
                '{"context":{"synthesis":{"audio":{"metadataoptions":{'
                '"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
                '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
                "}}}}\r\n"
            )

        async def send_ssml_request() -> None:
            """Sends the SSML request to the service."""
            await websocket.send_str(
                ssml_headers_plus_data(
                    connect_id(),
                    date_to_string(),
                    mkssml(
                        self.tts_config,
                        self.state["partial_text"],
                    ),
                )
            )

        # audio_was_received indicates whether we have received audio data
        # from the websocket. This is so we can raise an exception if we
        # don't receive any audio data.
        audio_was_received = False

        # Create a new connection to the service.
        ssl_ctx = ssl.create_default_context(cafile=certifi.where())
        async with aiohttp.ClientSession(
            connector=self.connector,
            trust_env=True,
            timeout=self.session_timeout,
        ) as session, session.ws_connect(
            f"{WSS_URL}&Sec-MS-GEC={DRM.generate_sec_ms_gec()}"
            f"&Sec-MS-GEC-Version={SEC_MS_GEC_VERSION}"
            f"&ConnectionId={connect_id()}",
            compress=15,
            proxy=self.proxy,
            headers=WSS_HEADERS,
            ssl=ssl_ctx,
        ) as websocket:
            await send_command_request()

            await send_ssml_request()

            async for received in websocket:
                if received.type == aiohttp.WSMsgType.TEXT:
                    encoded_data: bytes = received.data.encode("utf-8")
                    parameters, data = get_headers_and_data(
                        encoded_data, encoded_data.find(b"\r\n\r\n")
                    )

                    path = parameters.get(b"Path", None)
                    if path == b"audio.metadata":
                        # Parse the metadata and yield it.
                        parsed_metadata = self.__parse_metadata(data)
                        yield parsed_metadata

                        # Update the last duration offset for use by the next SSML request.
                        self.state["last_duration_offset"] = (
                            parsed_metadata["offset"] + parsed_metadata["duration"]
                        )
                    elif path == b"turn.end":
                        # Update the offset compensation for the next SSML request.
                        self.state["offset_compensation"] = self.state[
                            "last_duration_offset"
                        ]

                        # Use average padding typically added by the service
                        # to the end of the audio data. This seems to work pretty
                        # well for now, but we might ultimately need to use a
                        # more sophisticated method like using ffmpeg to get
                        # the actual duration of the audio data.
                        self.state["offset_compensation"] += 8_750_000

                        # Exit the loop so we can send the next SSML request.
                        break
                    elif path not in (b"response", b"turn.start"):
                        raise UnknownResponse("Unknown path received")
                elif received.type == aiohttp.WSMsgType.BINARY:
                    # Message is too short to contain header length.
                    if len(received.data) < 2:
                        raise UnexpectedResponse(
                            "We received a binary message, but it is missing the header length."
                        )

                    # The first two bytes of the binary message contain the header length.
                    header_length = int.from_bytes(received.data[:2], "big")
                    if header_length > len(received.data):
                        raise UnexpectedResponse(
                            "The header length is greater than the length of the data."
                        )

                    # Parse the headers and data from the binary message.
                    parameters, data = get_headers_and_data(
                        received.data, header_length
                    )

                    # Check if the path is audio.
                    if parameters.get(b"Path") != b"audio":
                        raise UnexpectedResponse(
                            "Received binary message, but the path is not audio."
                        )

                    # At termination of the stream, the service sends a binary message
                    # with no Content-Type; this is expected. What is not expected is for
                    # an MPEG audio stream to be sent with no data.
                    content_type = parameters.get(b"Content-Type", None)
                    if content_type not in [b"audio/mpeg", None]:
                        raise UnexpectedResponse(
                            "Received binary message, but with an unexpected Content-Type."
                        )

                    # We only allow no Content-Type if there is no data.
                    if content_type is None:
                        if len(data) == 0:
                            continue

                        # If the data is not empty, then we need to raise an exception.
                        raise UnexpectedResponse(
                            "Received binary message with no Content-Type, but with data."
                        )

                    # If the data is empty now, then we need to raise an exception.
                    if len(data) == 0:
                        raise UnexpectedResponse(
                            "Received binary message, but it is missing the audio data."
                        )

                    # Yield the audio data.
                    audio_was_received = True
                    yield {"type": "audio", "data": data}
                elif received.type == aiohttp.WSMsgType.ERROR:
                    raise WebSocketError(
                        received.data if received.data else "Unknown error"
                    )

        if not audio_was_received:
            raise NoAudioReceived(
                "No audio was received. Please verify that your parameters are correct."
            )

    async def stream(
        self,
    ) -> AsyncGenerator[TTSChunk, None]:
        """
        Streams audio and metadata from the service.

        Raises:
            NoAudioReceived: If no audio is received from the service.
            UnexpectedResponse: If the response from the service is unexpected.
            UnknownResponse: If the response from the service is unknown.
            WebSocketError: If there is an error with the websocket.
        """

        # Check if stream was called before.
        if self.state["stream_was_called"]:
            raise RuntimeError("stream can only be called once.")
        self.state["stream_was_called"] = True

        # Stream the audio and metadata from the service.
        # The loop target intentionally writes each chunk straight into
        # self.state["partial_text"], where __stream() reads it.
        for self.state["partial_text"] in self.texts:
            try:
                async for message in self.__stream():
                    yield message
            except aiohttp.ClientResponseError as e:
                if e.status != 403:
                    raise

                # A 403 means the Sec-MS-GEC token was rejected; adjust the
                # DRM clock skew and retry this chunk once.
                DRM.handle_client_response_error(e)
                async for message in self.__stream():
                    yield message

    async def save(
        self,
        audio_fname: Union[str, bytes],
        metadata_fname: Optional[Union[str, bytes]] = None,
    ) -> None:
        """
        Save the audio and metadata to the specified files.

        Args:
            audio_fname (str or bytes): Path to write the MP3 audio to.
            metadata_fname (str or bytes, optional): Path to write
                WordBoundary metadata to as JSON lines; skipped if None.
        """
        # nullcontext() stands in for the metadata file when none is wanted,
        # so a single `with` statement covers both cases.
        metadata: Union[TextIOWrapper, ContextManager[None]] = (
            open(metadata_fname, "w", encoding="utf-8")
            if metadata_fname is not None
            else nullcontext()
        )
        with metadata, open(audio_fname, "wb") as audio:
            async for message in self.stream():
                if message["type"] == "audio":
                    audio.write(message["data"])
                elif (
                    isinstance(metadata, TextIOWrapper)
                    and message["type"] == "WordBoundary"
                ):
                    json.dump(message, metadata)
                    metadata.write("\n")

    def stream_sync(self) -> Generator[TTSChunk, None, None]:
        """Synchronous interface for async stream method"""

        def fetch_async_items(queue: Queue) -> None:  # type: ignore
            """Run the async stream on a private event loop, pushing each
            chunk into the queue; None marks the end of the stream."""

            async def get_items() -> None:
                async for item in self.stream():
                    queue.put(item)
                queue.put(None)

            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            loop.run_until_complete(get_items())
            loop.close()

        queue: Queue = Queue()  # type: ignore

        # NOTE(review): if the worker raises before queue.put(None), the
        # consumer loop below blocks forever on queue.get() — confirm the
        # intended failure behavior.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.submit(fetch_async_items, queue)

            while True:
                item = queue.get()
                if item is None:
                    break
                yield item

    def save_sync(
        self,
        audio_fname: Union[str, bytes],
        metadata_fname: Optional[Union[str, bytes]] = None,
    ) -> None:
        """Synchronous interface for async save method."""
        # Run the coroutine on a worker thread so this also works when the
        # caller's thread already hosts a running event loop.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(
                asyncio.run, self.save(audio_fname, metadata_fname)
            )
            future.result()
|
edge_tts/constants.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Constants for the edge_tts package."""
|
2 |
+
|
3 |
+
BASE_URL = "speech.platform.bing.com/consumer/speech/synthesize/readaloud"
|
4 |
+
TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
|
5 |
+
|
6 |
+
WSS_URL = f"wss://{BASE_URL}/edge/v1?TrustedClientToken={TRUSTED_CLIENT_TOKEN}"
|
7 |
+
VOICE_LIST = f"https://{BASE_URL}/voices/list?trustedclienttoken={TRUSTED_CLIENT_TOKEN}"
|
8 |
+
|
9 |
+
DEFAULT_VOICE = "en-US-EmmaMultilingualNeural"
|
10 |
+
|
11 |
+
CHROMIUM_FULL_VERSION = "130.0.2849.68"
|
12 |
+
CHROMIUM_MAJOR_VERSION = CHROMIUM_FULL_VERSION.split(".", maxsplit=1)[0]
|
13 |
+
SEC_MS_GEC_VERSION = f"1-{CHROMIUM_FULL_VERSION}"
|
14 |
+
BASE_HEADERS = {
|
15 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
16 |
+
f" (KHTML, like Gecko) Chrome/{CHROMIUM_MAJOR_VERSION}.0.0.0 Safari/537.36"
|
17 |
+
f" Edg/{CHROMIUM_MAJOR_VERSION}.0.0.0",
|
18 |
+
"Accept-Encoding": "gzip, deflate, br",
|
19 |
+
"Accept-Language": "en-US,en;q=0.9",
|
20 |
+
}
|
21 |
+
WSS_HEADERS = {
|
22 |
+
"Pragma": "no-cache",
|
23 |
+
"Cache-Control": "no-cache",
|
24 |
+
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
|
25 |
+
}
|
26 |
+
WSS_HEADERS.update(BASE_HEADERS)
|
27 |
+
VOICE_HEADERS = {
|
28 |
+
"Authority": "speech.platform.bing.com",
|
29 |
+
"Sec-CH-UA": f'" Not;A Brand";v="99", "Microsoft Edge";v="{CHROMIUM_MAJOR_VERSION}",'
|
30 |
+
f' "Chromium";v="{CHROMIUM_MAJOR_VERSION}"',
|
31 |
+
"Sec-CH-UA-Mobile": "?0",
|
32 |
+
"Accept": "*/*",
|
33 |
+
"Sec-Fetch-Site": "none",
|
34 |
+
"Sec-Fetch-Mode": "cors",
|
35 |
+
"Sec-Fetch-Dest": "empty",
|
36 |
+
}
|
37 |
+
VOICE_HEADERS.update(BASE_HEADERS)
|
edge_tts/data_classes.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Data models for edge-tts."""
|
2 |
+
|
3 |
+
# pylint: disable=too-few-public-methods
|
4 |
+
|
5 |
+
import argparse
|
6 |
+
import re
|
7 |
+
from dataclasses import dataclass
|
8 |
+
|
9 |
+
|
10 |
+
@dataclass
|
11 |
+
class TTSConfig:
|
12 |
+
"""
|
13 |
+
Represents the internal TTS configuration for edge-tts's Communicate class.
|
14 |
+
"""
|
15 |
+
|
16 |
+
voice: str
|
17 |
+
rate: str
|
18 |
+
volume: str
|
19 |
+
pitch: str
|
20 |
+
|
21 |
+
@staticmethod
|
22 |
+
def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:
|
23 |
+
"""
|
24 |
+
Validates the given string parameter based on type and pattern.
|
25 |
+
|
26 |
+
Args:
|
27 |
+
param_name (str): The name of the parameter.
|
28 |
+
param_value (str): The value of the parameter.
|
29 |
+
pattern (str): The pattern to validate the parameter against.
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
str: The validated parameter.
|
33 |
+
"""
|
34 |
+
if not isinstance(param_value, str):
|
35 |
+
raise TypeError(f"{param_name} must be str")
|
36 |
+
if re.match(pattern, param_value) is None:
|
37 |
+
raise ValueError(f"Invalid {param_name} '{param_value}'.")
|
38 |
+
return param_value
|
39 |
+
|
40 |
+
def __post_init__(self) -> None:
|
41 |
+
"""
|
42 |
+
Validates the TTSConfig object after initialization.
|
43 |
+
"""
|
44 |
+
|
45 |
+
# Possible values for voice are:
|
46 |
+
# - Microsoft Server Speech Text to Speech Voice (cy-GB, NiaNeural)
|
47 |
+
# - cy-GB-NiaNeural
|
48 |
+
# - fil-PH-AngeloNeural
|
49 |
+
# Always send the first variant as that is what Microsoft Edge does.
|
50 |
+
if not isinstance(self.voice, str):
|
51 |
+
raise TypeError("voice must be str")
|
52 |
+
match = re.match(r"^([a-z]{2,})-([A-Z]{2,})-(.+Neural)$", self.voice)
|
53 |
+
if match is not None:
|
54 |
+
lang = match.group(1)
|
55 |
+
region = match.group(2)
|
56 |
+
name = match.group(3)
|
57 |
+
if name.find("-") != -1:
|
58 |
+
region = region + "-" + name[: name.find("-")]
|
59 |
+
name = name[name.find("-") + 1 :]
|
60 |
+
self.voice = (
|
61 |
+
"Microsoft Server Speech Text to Speech Voice"
|
62 |
+
+ f" ({lang}-{region}, {name})"
|
63 |
+
)
|
64 |
+
|
65 |
+
# Validate the rate, volume, and pitch parameters.
|
66 |
+
self.validate_string_param(
|
67 |
+
"voice",
|
68 |
+
self.voice,
|
69 |
+
r"^Microsoft Server Speech Text to Speech Voice \(.+,.+\)$",
|
70 |
+
)
|
71 |
+
self.validate_string_param("rate", self.rate, r"^[+-]\d+%$")
|
72 |
+
self.validate_string_param("volume", self.volume, r"^[+-]\d+%$")
|
73 |
+
self.validate_string_param("pitch", self.pitch, r"^[+-]\d+Hz$")
|
74 |
+
|
75 |
+
|
76 |
+
class UtilArgs(argparse.Namespace):
    """Typed namespace for the CLI arguments parsed by the edge-tts util module."""

    # Input: literal text, or path of a file to read the text from.
    text: str
    file: str
    # Voice selection and prosody settings.
    voice: str
    list_voices: bool
    rate: str
    volume: str
    pitch: str
    # Subtitle cue length and output destinations.
    words_in_cue: int
    write_media: str
    write_subtitles: str
    # Optional proxy URL.
    proxy: str
|
edge_tts/drm.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""DRM module is used to handle DRM operations with clock skew correction.
|
2 |
+
Currently the only DRM operation is generating the Sec-MS-GEC token value
|
3 |
+
used in all API requests to Microsoft Edge's online text-to-speech service."""
|
4 |
+
|
5 |
+
import hashlib
|
6 |
+
from datetime import datetime as dt
|
7 |
+
from datetime import timezone as tz
|
8 |
+
from typing import Optional
|
9 |
+
|
10 |
+
import aiohttp
|
11 |
+
|
12 |
+
from .constants import TRUSTED_CLIENT_TOKEN
|
13 |
+
from .exceptions import SkewAdjustmentError
|
14 |
+
|
15 |
+
# Seconds between the Windows file time epoch (1601-01-01 00:00:00 UTC)
# and the Unix epoch (1970-01-01 00:00:00 UTC).
WIN_EPOCH = 11644473600
# Nanoseconds per second; used when converting to 100-ns Windows ticks.
S_TO_NS = 1e9
|
17 |
+
|
18 |
+
|
19 |
+
class DRM:
    """Handles DRM operations, correcting for local clock skew.

    The only DRM operation currently implemented is generating the
    Sec-MS-GEC token required by Microsoft Edge's online TTS service.
    """

    # Accumulated correction (seconds) applied on top of the system clock.
    clock_skew_seconds: float = 0.0

    @staticmethod
    def adj_clock_skew_seconds(skew_seconds: float) -> None:
        """Add *skew_seconds* to the accumulated clock-skew correction.

        Args:
            skew_seconds (float): Seconds to add to the current correction.

        Returns:
            None
        """
        DRM.clock_skew_seconds += skew_seconds

    @staticmethod
    def get_unix_timestamp() -> float:
        """Return the current Unix timestamp with clock-skew correction.

        Returns:
            float: Skew-corrected Unix timestamp.
        """
        return dt.now(tz.utc).timestamp() + DRM.clock_skew_seconds

    @staticmethod
    def parse_rfc2616_date(date: str) -> Optional[float]:
        """Parse an RFC 2616 date string into a Unix timestamp.

        Args:
            date (str): RFC 2616 date string, e.g. an HTTP ``Date`` header.

        Returns:
            Optional[float]: The timestamp, or None when parsing fails.
        """
        try:
            parsed = dt.strptime(date, "%a, %d %b %Y %H:%M:%S %Z")
        except ValueError:
            return None
        # strptime yields a naive datetime; the header is in UTC (GMT).
        return parsed.replace(tzinfo=tz.utc).timestamp()

    @staticmethod
    def handle_client_response_error(e: aiohttp.ClientResponseError) -> None:
        """Learn the clock skew from the server's ``Date`` response header.

        Args:
            e (aiohttp.ClientResponseError): The failed response to inspect.

        Raises:
            SkewAdjustmentError: If the server date is missing or invalid.

        Returns:
            None
        """
        headers = e.headers
        if headers is None:
            raise SkewAdjustmentError("No server date in headers.") from e
        server_date: Optional[str] = headers.get("Date", None)
        if server_date is None or not isinstance(server_date, str):
            raise SkewAdjustmentError("No server date in headers.") from e
        server_date_parsed: Optional[float] = DRM.parse_rfc2616_date(server_date)
        if server_date_parsed is None or not isinstance(server_date_parsed, float):
            raise SkewAdjustmentError(
                f"Failed to parse server date: {server_date}"
            ) from e
        # Skew = server time minus our (already skew-corrected) local time.
        DRM.adj_clock_skew_seconds(server_date_parsed - DRM.get_unix_timestamp())

    @staticmethod
    def generate_sec_ms_gec() -> str:
        """Generate the Sec-MS-GEC token value.

        The token is the uppercase SHA-256 hex digest of the current
        Windows file time (skew-corrected, floored to a 5-minute boundary)
        concatenated with the trusted client token.

        Returns:
            str: The generated Sec-MS-GEC token value.

        See Also:
            https://github.com/rany2/edge-tts/issues/290#issuecomment-2464956570
        """
        # Skew-corrected Unix time, shifted onto the Windows epoch.
        ticks = DRM.get_unix_timestamp() + WIN_EPOCH

        # Floor to the nearest 5 minutes (300 seconds).
        ticks -= ticks % 300

        # Express as 100-nanosecond intervals (Windows file time units).
        ticks *= S_TO_NS / 100

        str_to_hash = f"{ticks:.0f}{TRUSTED_CLIENT_TOKEN}"
        return hashlib.sha256(str_to_hash.encode("ascii")).hexdigest().upper()
|
edge_tts/exceptions.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Custom exceptions for the edge-tts package."""
|
2 |
+
|
3 |
+
|
4 |
+
class EdgeTTSException(Exception):
    """Common base class for all exceptions raised by edge-tts."""
|
6 |
+
|
7 |
+
|
8 |
+
class UnknownResponse(EdgeTTSException):
    """Raised when the server sends a response we do not recognize."""
|
10 |
+
|
11 |
+
|
12 |
+
class UnexpectedResponse(EdgeTTSException):
    """Raised when a recognized but unexpected response arrives.

    Not observed in practice so far; kept in case the server changes
    its response format in the future."""
|
17 |
+
|
18 |
+
|
19 |
+
class NoAudioReceived(EdgeTTSException):
    """Raised when the server returned no audio data at all."""
|
21 |
+
|
22 |
+
|
23 |
+
class WebSocketError(EdgeTTSException):
    """Raised when an error occurs on the WebSocket connection."""
|
25 |
+
|
26 |
+
|
27 |
+
class SkewAdjustmentError(EdgeTTSException):
    """Raised when the clock-skew adjustment cannot be performed."""
|
edge_tts/py.typed
ADDED
File without changes
|
edge_tts/typing.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Custom types for edge-tts."""
|
2 |
+
|
3 |
+
# pylint: disable=too-few-public-methods
|
4 |
+
|
5 |
+
from typing import List
|
6 |
+
|
7 |
+
from typing_extensions import Literal, NotRequired, TypedDict
|
8 |
+
|
9 |
+
|
10 |
+
class TTSChunk(TypedDict):
    """TTS chunk data.

    One item yielded by the TTS stream. ``type`` is the discriminator:
    "audio" chunks carry ``data``; "WordBoundary" chunks carry
    ``duration``, ``offset`` and ``text``.
    """

    type: Literal["audio", "WordBoundary"]
    data: NotRequired[bytes]  # only for audio
    duration: NotRequired[float]  # only for WordBoundary
    offset: NotRequired[float]  # only for WordBoundary
    text: NotRequired[str]  # only for WordBoundary
|
18 |
+
|
19 |
+
|
20 |
+
class VoiceTag(TypedDict):
    """VoiceTag data.

    Closed sets of descriptive tags attached to each voice by the
    service's voice-list endpoint. Entries are whitespace-stripped on
    load (see voices.__list_voices).
    """

    ContentCategories: List[
        Literal[
            "Cartoon",
            "Conversation",
            "Copilot",
            "Dialect",
            "General",
            "News",
            "Novel",
            "Sports",
        ]
    ]
    VoicePersonalities: List[
        Literal[
            "Approachable",
            "Authentic",
            "Authority",
            "Bright",
            "Caring",
            "Casual",
            "Cheerful",
            "Clear",
            "Comfort",
            "Confident",
            "Considerate",
            "Conversational",
            "Cute",
            "Expressive",
            "Friendly",
            "Honest",
            "Humorous",
            "Lively",
            "Passion",
            "Pleasant",
            "Positive",
            "Professional",
            "Rational",
            "Reliable",
            "Sincere",
            "Sunshine",
            "Warm",
        ]
    ]
|
66 |
+
|
67 |
+
|
68 |
+
class Voice(TypedDict):
    """Voice data.

    Shape of one entry in the JSON voice list returned by the service
    (see voices.list_voices()).
    """

    Name: str
    ShortName: str  # used as the sort key when listing voices in the CLI
    Gender: Literal["Female", "Male"]
    Locale: str  # BCP 47-style tag; its first subtag becomes "Language"
    SuggestedCodec: Literal["audio-24khz-48kbitrate-mono-mp3"]
    FriendlyName: str
    Status: Literal["GA"]
    VoiceTag: VoiceTag
|
79 |
+
|
80 |
+
|
81 |
+
class VoicesManagerVoice(Voice):
    """Voice data for VoicesManager.

    Extends Voice with a "Language" key — the first subtag of Locale,
    added by VoicesManager.create().
    """

    Language: str
|
85 |
+
|
86 |
+
|
87 |
+
class VoicesManagerFind(TypedDict):
    """Voice data for VoicesManager.find().

    All keys are optional filters; an omitted key matches any voice.
    """

    Gender: NotRequired[Literal["Female", "Male"]]
    Locale: NotRequired[str]
    Language: NotRequired[str]
|
93 |
+
|
94 |
+
|
95 |
+
class CommunicateState(TypedDict):
    """Communicate state data.

    Per-stream bookkeeping used by Communicate (defined in
    communicate.py, outside this view).
    """

    # NOTE(review): field semantics inferred from names; confirm against
    # communicate.py before relying on them.
    partial_text: bytes
    offset_compensation: float
    last_duration_offset: float
    stream_was_called: bool
|
edge_tts/util.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utility functions for the command line interface. Used by the main module."""
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import asyncio
|
5 |
+
import sys
|
6 |
+
from typing import Optional, TextIO
|
7 |
+
|
8 |
+
from tabulate import tabulate
|
9 |
+
|
10 |
+
from . import Communicate, SubMaker, list_voices
|
11 |
+
from .constants import DEFAULT_VOICE
|
12 |
+
from .data_classes import UtilArgs
|
13 |
+
|
14 |
+
|
15 |
+
async def _print_voices(*, proxy: Optional[str]) -> None:
    """Print a table of all available voices, sorted by short name."""
    voices = sorted(await list_voices(proxy=proxy), key=lambda v: v["ShortName"])
    headers = ["Name", "Gender", "ContentCategories", "VoicePersonalities"]
    rows = []
    for voice in voices:
        tag = voice["VoiceTag"]
        rows.append(
            [
                voice["ShortName"],
                voice["Gender"],
                ", ".join(tag["ContentCategories"]),
                ", ".join(tag["VoicePersonalities"]),
            ]
        )
    print(tabulate(rows, headers))
|
30 |
+
|
31 |
+
|
32 |
+
async def _run_tts(args: UtilArgs) -> None:
    """Run TTS after parsing arguments from command line.

    Streams audio to --write-media (or stdout) and writes SRT subtitles
    to --write-subtitles (or stderr when "-" is given; none otherwise).

    Args:
        args (UtilArgs): Parsed command-line arguments.

    Returns:
        None
    """

    try:
        # Refuse to dump raw audio to an interactive terminal without an
        # explicit confirmation from the user.
        if sys.stdin.isatty() and sys.stdout.isatty() and not args.write_media:
            print(
                "Warning: TTS output will be written to the terminal. "
                "Use --write-media to write to a file.\n"
                "Press Ctrl+C to cancel the operation. "
                "Press Enter to continue.",
                file=sys.stderr,
            )
            input()
    except KeyboardInterrupt:
        print("\nOperation canceled.", file=sys.stderr)
        return

    communicate = Communicate(
        args.text,
        args.voice,
        rate=args.rate,
        volume=args.volume,
        pitch=args.pitch,
        proxy=args.proxy,
    )
    submaker = SubMaker()

    # Bug fix: bind both output handles *before* the try block. The
    # original assigned them inside the try, so a failing open() left
    # them unbound and the finally clause raised NameError instead of
    # the real error.
    audio_file = sys.stdout.buffer
    sub_file: Optional[TextIO] = None
    try:
        if args.write_media is not None and args.write_media != "-":
            audio_file = open(args.write_media, "wb")
        if args.write_subtitles == "-":
            sub_file = sys.stderr
        elif args.write_subtitles is not None:
            sub_file = open(args.write_subtitles, "w", encoding="utf-8")

        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                submaker.feed(chunk)

        if args.words_in_cue > 0:
            submaker.merge_cues(args.words_in_cue)

        if sub_file is not None:
            sub_file.write(submaker.get_srt())
    finally:
        # Only close handles we opened ourselves.
        if audio_file is not sys.stdout.buffer:
            audio_file.close()
        if sub_file is not None and sub_file is not sys.stderr:
            sub_file.close()
|
88 |
+
|
89 |
+
|
90 |
+
async def amain() -> None:
    """Async main function.

    Parses CLI arguments, optionally prints the voice list, resolves the
    input text (from --text, a file, or stdin) and runs the TTS job.
    """
    parser = argparse.ArgumentParser(
        description="Text-to-speech using Microsoft Edge's online TTS service."
    )
    # --text, --file and --list-voices are mutually exclusive; exactly
    # one of them must be supplied.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-t", "--text", help="what TTS will say")
    group.add_argument("-f", "--file", help="same as --text but read from file")
    parser.add_argument(
        "-v",
        "--voice",
        help=f"voice for TTS. Default: {DEFAULT_VOICE}",
        default=DEFAULT_VOICE,
    )
    group.add_argument(
        "-l",
        "--list-voices",
        help="lists available voices and exits",
        action="store_true",
    )
    # "%%" is required because argparse applies %-formatting to help text.
    parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%")
    parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%")
    parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz")
    parser.add_argument(
        "--words-in-cue",
        help="number of words in a subtitle cue. Default: 10.",
        default=10,
        type=int,
    )
    parser.add_argument(
        "--write-media", help="send media output to file instead of stdout"
    )
    parser.add_argument(
        "--write-subtitles",
        help="send subtitle output to provided file instead of stderr",
    )
    parser.add_argument("--proxy", help="use a proxy for TTS and voice list.")
    # Parse directly into the typed UtilArgs namespace.
    args = parser.parse_args(namespace=UtilArgs())

    if args.list_voices:
        await _print_voices(proxy=args.proxy)
        sys.exit(0)

    if args.file is not None:
        # "-" or /dev/stdin means: read the text from standard input.
        if args.file in ("-", "/dev/stdin"):
            args.text = sys.stdin.read()
        else:
            with open(args.file, "r", encoding="utf-8") as file:
                args.text = file.read()

    if args.text is not None:
        await _run_tts(args)
|
142 |
+
|
143 |
+
|
144 |
+
def main() -> None:
    """Synchronous entry point: drive amain() on an asyncio event loop."""
    asyncio.run(amain())
|
147 |
+
|
148 |
+
|
149 |
+
# Allow direct execution of this module (e.g. `python -m edge_tts.util`).
if __name__ == "__main__":
    main()
|
edge_tts/version.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Version information for the edge_tts package."""
|
2 |
+
|
3 |
+
__version__ = "7.0.0"
|
4 |
+
__version_info__ = tuple(int(num) for num in __version__.split("."))
|
edge_tts/voices.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""This module contains functions to list all available voices and a class to find the
|
2 |
+
correct voice based on their attributes."""
|
3 |
+
|
4 |
+
import json
|
5 |
+
import ssl
|
6 |
+
from typing import List, Optional
|
7 |
+
|
8 |
+
import aiohttp
|
9 |
+
import certifi
|
10 |
+
from typing_extensions import Unpack
|
11 |
+
|
12 |
+
from .constants import SEC_MS_GEC_VERSION, VOICE_HEADERS, VOICE_LIST
|
13 |
+
from .drm import DRM
|
14 |
+
from .typing import Voice, VoicesManagerFind, VoicesManagerVoice
|
15 |
+
|
16 |
+
|
17 |
+
async def __list_voices(
    session: aiohttp.ClientSession, ssl_ctx: ssl.SSLContext, proxy: Optional[str]
) -> List[Voice]:
    """
    Private helper that fetches and parses the voice-list endpoint.

    Used by list_voices() so that clock-skew related client response
    errors can be handled in one place.

    Args:
        session (aiohttp.ClientSession): The aiohttp session to use for the request.
        ssl_ctx (ssl.SSLContext): The SSL context to use for the request.
        proxy (Optional[str]): The proxy to use for the request.

    Returns:
        List[Voice]: A list of voices and their attributes.
    """
    async with session.get(
        f"{VOICE_LIST}&Sec-MS-GEC={DRM.generate_sec_ms_gec()}"
        f"&Sec-MS-GEC-Version={SEC_MS_GEC_VERSION}",
        headers=VOICE_HEADERS,
        proxy=proxy,
        ssl=ssl_ctx,
        raise_for_status=True,
    ) as resp:
        data: List[Voice] = json.loads(await resp.text())

    # Normalize stray whitespace in the tag entries. Observed once with
    # zh-CN-YunjianNeural, which had a leading space in a category.
    for voice in data:
        tag = voice["VoiceTag"]
        tag["ContentCategories"] = [
            category.strip()  # type: ignore
            for category in tag["ContentCategories"]
        ]
        tag["VoicePersonalities"] = [
            personality.strip()  # type: ignore
            for personality in tag["VoicePersonalities"]
        ]

    return data
|
57 |
+
|
58 |
+
|
59 |
+
async def list_voices(
    *, connector: Optional[aiohttp.BaseConnector] = None, proxy: Optional[str] = None
) -> List[Voice]:
    """
    List all available voices and their attributes.

    This pulls data from the URL used by Microsoft Edge to return a list of
    all available voices.

    Args:
        connector (Optional[aiohttp.BaseConnector]): The connector to use for the request.
        proxy (Optional[str]): The proxy to use for the request.

    Returns:
        List[Voice]: A list of voices and their attributes.
    """
    # Use certifi's CA bundle so TLS verification works even in minimal
    # environments without a system certificate store.
    ssl_ctx = ssl.create_default_context(cafile=certifi.where())
    async with aiohttp.ClientSession(connector=connector, trust_env=True) as session:
        try:
            data = await __list_voices(session, ssl_ctx, proxy)
        except aiohttp.ClientResponseError as e:
            # A 403 typically means the Sec-MS-GEC token was rejected
            # because the local clock is skewed; learn the skew from the
            # server's Date header and retry exactly once.
            if e.status != 403:
                raise

            DRM.handle_client_response_error(e)
            data = await __list_voices(session, ssl_ctx, proxy)
    return data
|
86 |
+
|
87 |
+
|
88 |
+
class VoicesManager:
    """
    A class to find the correct voice based on their attributes.
    """

    def __init__(self) -> None:
        # Voices augmented with a "Language" key; populated by create().
        self.voices: List[VoicesManagerVoice] = []
        # Guards find() against use before the async initializer ran.
        self.called_create: bool = False

    @classmethod
    async def create(
        cls, custom_voices: Optional[List[Voice]] = None
    ) -> "VoicesManager":
        """
        Creates a VoicesManager object and populates it with all available voices.

        Args:
            custom_voices (Optional[List[Voice]]): Use this list instead of
                fetching the live voice list from the service.

        Returns:
            VoicesManager: The populated manager instance.
        """
        # Bug fix: instantiate through `cls` rather than hard-coding
        # VoicesManager(), so subclasses get instances of their own type.
        self = cls()
        voices = await list_voices() if custom_voices is None else custom_voices
        # Derive "Language" from the first subtag of the locale
        # (e.g. "en-US" -> "en").
        self.voices = [
            {**voice, "Language": voice["Locale"].split("-")[0]} for voice in voices
        ]
        self.called_create = True
        return self

    def find(self, **kwargs: Unpack[VoicesManagerFind]) -> List[VoicesManagerVoice]:
        """
        Finds all matching voices based on the provided attributes.

        Raises:
            RuntimeError: If called before create().

        Returns:
            List[VoicesManagerVoice]: Voices whose attributes contain every
                provided key/value pair.
        """
        if not self.called_create:
            raise RuntimeError(
                "VoicesManager.find() called before VoicesManager.create()"
            )

        # dict.items() view comparison: kwargs must be a subset of the
        # voice's key/value pairs for the voice to match.
        return [voice for voice in self.voices if kwargs.items() <= voice.items()]
|