puzan789 committed on
Commit
8d1b3ab
·
1 Parent(s): 4e31ab5
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a base image
FROM python:3.11-slim

# Unbuffered stdout/stderr so container logs appear immediately
ENV PYTHONUNBUFFERED=1

# Create and set the working directory
WORKDIR /app

# Copy requirements.txt first so the dependency install below is cached
# as its own layer and only re-runs when requirements change
COPY requirements.txt /app/

# Install any needed dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the current directory contents into the container at /app
COPY . /app/

# Port the FastAPI/uvicorn server listens on (must match app.py)
EXPOSE 7860

# app.py starts uvicorn itself in its __main__ block, so plain
# "python app.py" brings the server up
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, WebSocket,WebSocketDisconnect
2
+ import asyncio
3
+ import base64
4
+ from src.core.speechtotext import SpeechToText
5
+ from src.core.texttospeech import TextToSpeech
6
+ from src.core.texttotext import ConversationHandler
7
+ import os
8
# FastAPI application plus module-level pipeline singletons shared by every
# websocket connection handled by this process.
app = FastAPI()
spt = SpeechToText()         # speech-to-text (Groq Whisper, see src/core/speechtotext.py)
ttt = ConversationHandler()  # text-to-text LLM reply (see src/core/texttotext.py)
tts = TextToSpeech()         # text-to-speech (ElevenLabs, see src/core/texttospeech.py)
12
@app.websocket("/ws/voicechat")
async def websocket_endpoint(websocket: WebSocket):
    """Bidirectional voice-chat endpoint.

    Accumulates raw audio bytes from the client; a 3-second gap in the stream
    is treated as end-of-utterance, after which the audio is run through
    STT -> LLM -> TTS and the transcript, text reply and base64-encoded audio
    are sent back in a single JSON message.
    """
    await websocket.accept()
    print("User connected.")
    audio_buffer = bytearray()  # raw audio chunks for the current utterance

    try:
        while True:
            try:
                # 3 s without data is interpreted as silence / end of speech.
                audio_data = await asyncio.wait_for(websocket.receive_bytes(), timeout=3.0)
                print(f"Received {len(audio_data)} bytes")
                audio_buffer.extend(audio_data)

            except asyncio.TimeoutError:
                if len(audio_buffer) > 0:
                    print("Silence detected. Processing speech...")
                    transcript = await spt.trancribe_audio(audio_buffer)
                    audio_buffer.clear()

                    if transcript:
                        print(f"User said: {transcript}")
                        response = await ttt.handle_conversation(transcript)
                        if response:
                            print(f"AI Response: {response}")
                            audio = await tts.synthesize(response)
                            # BUG FIX: synthesize() returns None on failure;
                            # b64encode(None) would raise and be swallowed below.
                            if audio is not None:
                                audio_base64 = base64.b64encode(audio).decode("utf-8")
                                await websocket.send_json({
                                    "transcript": transcript,
                                    "response": response,
                                    "audio": audio_base64,
                                    "status": "complete"
                                })
                                # Wait for the client's acknowledgement before
                                # listening for the next utterance.
                                await websocket.receive_text()

            except WebSocketDisconnect:
                # BUG FIX: WebSocketDisconnect subclasses Exception, so the
                # generic handler below used to swallow it and the loop kept
                # polling a dead socket forever. Re-raise so the outer
                # handler terminates the session cleanly.
                raise
            except Exception as e:
                print(f"Error: {e}")
    except WebSocketDisconnect:
        print("User disconnected.")
50
if __name__ == '__main__':
    # Run the ASGI app directly; port 7860 matches the Dockerfile's EXPOSE.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        timeout_keep_alive=300,
        timeout_graceful_shutdown=600,
    )
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
elevenlabs
groq
python-dotenv
Requests
fastapi
websockets
uvicorn
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (142 Bytes). View file
 
src/core/__init__.py ADDED
File without changes
src/core/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (147 Bytes). View file
 
src/core/__pycache__/speechtotext.cpython-312.pyc ADDED
Binary file (1.46 kB). View file
 
src/core/__pycache__/texttospeech.cpython-312.pyc ADDED
Binary file (3.87 kB). View file
 
src/core/__pycache__/texttotext.cpython-312.pyc ADDED
Binary file (2.54 kB). View file
 
src/core/speechtotext.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wave
2
+ import io
3
+ from groq import Groq
4
+
5
+
6
class SpeechToText:
    """Thin wrapper around Groq's Whisper transcription endpoint."""

    def __init__(self):
        # NOTE(review): Groq() presumably reads GROQ_API_KEY from the
        # environment — confirm deployment config.
        self.client = Groq()

    async def trancribe_audio(self, audio_bytes: bytes):
        """Transcribe raw WAV bytes; return the text, or None on failure.

        NOTE(review): the name keeps its original misspelling because
        app.py calls `spt.trancribe_audio(...)`; use the correctly spelled
        alias below in new code.
        """
        wav_buffer = io.BytesIO(audio_bytes)
        try:
            # The Groq SDK call here is synchronous and will block the event
            # loop for the duration of the request — acceptable for a demo,
            # worth offloading to a thread for production.
            transcription = self.client.audio.transcriptions.create(
                file=("audio.wav", wav_buffer),
                model="whisper-large-v3-turbo",
            )
            return transcription.text
        except Exception as e:
            print(f"Error transcribing audio: {e}")
            return None

    # Correctly spelled, backward-compatible alias for new callers.
    transcribe_audio = trancribe_audio
+
src/core/texttospeech.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from elevenlabs import ElevenLabs,Voice,VoiceSettings,play
2
+ import os
3
+
4
class TextToSpeech:
    """Converts text to speech audio using the ElevenLabs API."""

    def __init__(self):
        self.client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

    async def synthesize(self, text: str):
        """Return synthesized speech for *text* as raw bytes.

        Raises ValueError for empty input or input over 5000 characters.
        Returns None if the ElevenLabs call itself fails.
        """
        # Guard clauses: validate before touching the network.
        if not text.strip():
            raise ValueError("Input text cannot be empty")
        if len(text) > 5000:
            raise ValueError("Input text cannot exceed 5000 characters")
        try:
            voice = Voice(
                voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
                settings=VoiceSettings(stability=0.5, similarity_boost=0.5),
            )
            # generate() yields audio chunks; collect them into one blob.
            chunks = self.client.generate(
                text=text,
                voice=voice,
                model=os.getenv("TTS_MODEL_NAME"),
            )
            return b"".join(chunks)
        except Exception as e:
            print(f"Error synthesizing text: {str(e)}")
            return None
29
+
30
+
31
+
32
if __name__ == "__main__":
    # Manual smoke test: synthesizes a Nepali paragraph and plays it locally.
    # Requires ELEVENLABS_API_KEY / ELEVENLABS_VOICE_ID / TTS_MODEL_NAME in
    # the environment; not exercised by the web app.
    import asyncio
    async def main():
        tts = TextToSpeech()
        audio_bytes = await tts.synthesize('''इपिङ सफ्टवेयरले गुगलको ट्रान्सलिटरेसन सेवा प्रयोग गर्दछ। यसले छिटो र सही टाइपिङ प्रदान गर्दछ, जसले वेबमा नेपाली भाषा टाइप गर्न सजिलो बनाउँछ। तपाईंले अंग्रेजीमा शब्द टाइप गरी स्पेसबार थिचेपछि, उक्त शब्द नेपालीमा रूपान्तरण हुनेछ। तपाईंले ब्याकस्पेस थिचेर वा चयन गरिएको शब्दमा क्लिक गरेर थप विकल्पहरू पनि प्राप्त गर्न सक्नुहुन्छ। यो प्रक्रिया छिटो छ र असीमित क्यारेक्टरहरू र शब्दहरू रूपान्तरण गर्न सक्षम छ। अझै, स्पेसबार थिचेपछि, पाठ तपाईंको कम्प्युटरमा स्वचालित रूपमा सुरक्षित हुनेछ, जसले ब्राउजर क्र्यास भएमा वा पछि पुन: आगमन गर्दा पहिलेको रूपान्तरण गरिएको पाठ पुन: प्राप्त गर्न मद्दत गर्दछ। ''')
        play(audio_bytes)


    asyncio.run(main())
src/core/texttotext.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from groq import AsyncGroq
2
+
3
+
4
+
5
class ConversationHandler:
    """Generates a short, friendly reply to a transcribed user utterance
    via Groq's chat-completion API."""

    def __init__(self):
        # NOTE(review): AsyncGroq() presumably reads GROQ_API_KEY from the
        # environment — confirm deployment config.
        self.client = AsyncGroq()

    async def handle_conversation(self, transcription):
        """Return the assistant's reply to *transcription* (a str).

        The system prompt pins the "Callme" persona and asks for a brief,
        enthusiastic response; max_tokens=125 keeps replies short.
        """
        messages = [
            {
                "role": "system",
                "content":'''You are a friendly and engaging virtual assistant named Callme, designed to assist calling agents in creating pleasant and effective phone interactions. Your persona is warm, approachable, and always ready to help, making every caller feel valued.
Your task is to respond to incoming calls with a sweet and succinct greeting that sets a positive tone for the conversation.
Here are some details to keep in mind:
The response should be brief, ideally no longer than a couple of sentences.
Make sure to convey enthusiasm and willingness to assist.
'''
            },
            {
                "role": "user",
                "content": transcription,
            }
        ]
        completion = await self.client.chat.completions.create(
            messages=messages,
            model="llama-3.3-70b-versatile",
            temperature=0.5,  # mildly creative but mostly consistent
            max_tokens=125,
            top_p=1,
            n=1,
        )
        # Echo the reply to the server log, then hand it to the caller.
        print(completion.choices[0].message.content)
        return completion.choices[0].message.content
35
+
36
+
37
+
38
if __name__ == "__main__":
    # Manual smoke test: requires GROQ_API_KEY in the environment.
    import asyncio

    async def _demo():
        bot = ConversationHandler()
        reply = await bot.handle_conversation("Hi, I need help with a technical issue.")
        print(reply)

    asyncio.run(_demo())