Update app.py
app.py CHANGED
@@ -1,174 +1,72 @@
 # ruff: noqa: E402
-
+
+# 1. Force spaces to initialize first
 try:
-    import spaces  # Must be first
+    import spaces  # Must be first!
     USING_SPACES = True
 except ImportError:
     USING_SPACES = False

-# Suppress warnings
+# 2. Suppress all non-critical warnings
 import warnings
-warnings.filterwarnings("ignore",
-warnings.filterwarnings("ignore",
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", message=".*GLIBCXX.*")
+warnings.filterwarnings("ignore", module="thinc")

-# Configure device
-
-
-    from torch.cuda import is_available as cuda_is_available
-    device = "cuda" if cuda_is_available() else "cpu"
-else:
-    device = "cpu"
+# 3. Configure device before any other imports
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Force CPU-only

-#
+# 4. Import remaining packages
 import json
 import re
 import tempfile
 import hashlib
 import spacy
-from collections import defaultdict
 from pathlib import Path
-from importlib.resources import files
 from pydub import AudioSegment, silence
 from spacy.matcher import DependencyMatcher
-from typing import List, Dict

-import click
 import gradio as gr
 import numpy as np
 import soundfile as sf
 import torchaudio
-from cached_path import cached_path
-from transformers import AutoModelForCausalLM, AutoTokenizer

-#
+# 5. Load spaCy model
 nlp = spacy.load("en_core_web_trf")

+# 6. Simplified Dialogue Parser
 class DialogueParser:
     def __init__(self):
         self.matcher = DependencyMatcher(nlp.vocab)
-        self._add_patterns()
-
-    def _add_patterns(self):
-        said_pattern = [
-            {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB", "LEMMA": {"IN": ["say", "ask", "reply"]}}},
+        pattern = [
+            {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB", "LEMMA": {"IN": ["say", "ask", "reply"]}}},
             {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "speaker", "RIGHT_ATTRS": {"DEP": "nsubj"}}
         ]
-        self.matcher.add("SAID_CONSTRUCTION", [said_pattern])
-
-    def parse(self, text: str) -> List[Dict]:
-        doc = nlp(text)
-        matches = self.matcher(doc)
-        dialogues = []
-
-        for match_id, token_ids in matches:
-            if nlp.vocab.strings[match_id] == "SAID_CONSTRUCTION":
-                speaker = doc[token_ids[1]].text
-                quote = self._extract_quote(doc[token_ids[0]])
-                dialogues.append({"speaker": speaker, "text": quote})
-
-        return dialogues
-
-    def _extract_quote(self, verb_token):
-        return " ".join([t.text for t in verb_token.children if t.dep_ == "ccomp"])
-
-# ========== FIXED GRADIO UI ========== #
-class VoiceBank:
-    def __init__(self):
-        self.voices = {}
-        self.speaker_map = {}
-
-    def add_voice(self, speaker_name: str, audio_path: str):
-        voice_id = hashlib.md5(Path(audio_path).read_bytes()).hexdigest()
-        self.voices[voice_id] = self._preprocess_audio(audio_path)
-
-    def _preprocess_audio(self, audio_path: str):
-        audio, sr = torchaudio.load(audio_path)
-        if sr != 24000:
-            resampler = torchaudio.transforms.Resample(sr, 24000)
-            audio = resampler(audio)
-        return audio.numpy()
-
-class AudiobookGenerator:
-    def __init__(self, voice_bank: VoiceBank):
-        self.voice_bank = voice_bank
-        self.sample_rate = 24000
-
-    def generate_audiobook(self, text: str, progress=gr.Progress()):
-        parser = DialogueParser()
-        dialogues = parser.parse(text)
-        audio_segments = []
-
-        for dialogue in progress.tqdm(dialogues):
-            audio = self._generate_segment(dialogue["text"])
-            audio_segments.append(audio)
-
-        return (self.sample_rate, np.concatenate(audio_segments))
-
-    def _generate_segment(self, text: str):
-        # Implement your TTS logic here
-        return np.random.rand(16000)  # Placeholder
+        self.matcher.add("DIALOGUE", [pattern])

-
-
+    def parse(self, text):
+        return [{"speaker": "Narrator", "text": text}]  # Simplified for demo

-#
-with gr.Blocks() as app_audiobook:
+# 7. Gradio UI
+with gr.Blocks() as app:
     gr.Markdown("# Audiobook Generator")

     with gr.Row():
-        txt_input = gr.File(label="Upload Text")
-
-
-    detected_speakers = gr.State()
-    with gr.Column(visible=False) as speaker_ui:
-        gr.Markdown("## Assign Voices to Speakers")
-        speaker_container = gr.Row()
+        txt_input = gr.File(label="Upload Text", file_types=[".txt"])
+        generate_btn = gr.Button("Generate", variant="primary")

-
-    audio_output = gr.Audio(label="Generated Audiobook", autoplay=True)
+    audio_output = gr.Audio(label="Result")

-    def analyze_text(file):
-        with open(file.name) as f:
-            text = f.read()
-
-        parser = DialogueParser()
-        dialogues = parser.parse(text)
-        speakers = list({d["speaker"] for d in dialogues if d["speaker"] != "Narrator"})
-
-        components = []
-        for speaker in speakers:
-            components.append(gr.Audio(label=f"Voice for {speaker}", type="filepath"))
-
-        return [
-            gr.update(visible=True),  # speaker_ui visibility
-            gr.Row.update(components=components),  # Update speaker_container
-            gr.update(visible=True),  # generate_btn visibility
-            speakers  # Store in detected_speakers state
-        ]
+    def generate(file):
+        return (24000, np.random.rand(16000))  # Replace with TTS

-    analyze_btn.click(
-        analyze_text,
-        inputs=txt_input,
-        outputs=[speaker_ui, speaker_container, generate_btn, detected_speakers]
-    )
-
     generate_btn.click(
-
+        generate,
         inputs=txt_input,
         outputs=audio_output
     )
-# ========== APP LAUNCH ========== #
-with gr.Blocks() as app:
-    gr.TabbedInterface(
-        [app_audiobook],
-        ["Audiobook"]
-    )
-
-@click.command()
-@click.option("--port", "-p", default=7860)
-@click.option("--host", "-H", default="0.0.0.0")
-def main(port, host):
-    app.queue().launch(server_port=port, server_name=host)

+# 8. Hugging Face Spaces entry point
 if __name__ == "__main__":
-    main()
+    app.launch(server_name="0.0.0.0", server_port=7860)
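
A note on the "# Replace with TTS" placeholder: the sketch below is one hypothetical way the new generate function could be fleshed out inside app.py. It assumes a synthesize(text) -> np.ndarray helper backed by whichever 24 kHz TTS model the Space eventually adopts; the helper name and the 400 ms pause are illustrative, not part of the code above. It reuses the DialogueParser defined earlier in app.py and keeps the (sample_rate, numpy array) return shape that gr.Audio accepts.

import numpy as np
from pathlib import Path

SAMPLE_RATE = 24000  # matches the rate hard-coded in app.py

def synthesize(text: str) -> np.ndarray:
    # Hypothetical TTS hook: a real implementation would return float32 PCM
    # at SAMPLE_RATE for the given text. Silence stands in here.
    return np.zeros(SAMPLE_RATE, dtype=np.float32)

def generate(file):
    # gr.File may hand back a filepath string or a tempfile-like object.
    path = file if isinstance(file, str) else file.name
    text = Path(path).read_text(encoding="utf-8")

    parser = DialogueParser()  # defined earlier in app.py
    pause = np.zeros(int(0.4 * SAMPLE_RATE), dtype=np.float32)  # gap between lines

    segments = []
    for dialogue in parser.parse(text):
        segments.append(synthesize(dialogue["text"]))
        segments.append(pause)

    return (SAMPLE_RATE, np.concatenate(segments))

Swapping the random-noise return for something along these lines keeps the Gradio wiring unchanged; only synthesize needs to change once a real model is plugged in.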