Anurag committed · 5306da4
Parent(s): a8356b0

version-2 initial version

This view is limited to 50 files because it contains too many changes. See raw diff.
- .DS_Store +0 -0
- .env +7 -1
- .gitattributes +1 -0
- .gitignore +12 -0
- README.md +261 -155
- __pycache__/chat_database.cpython-310.pyc +0 -0
- __pycache__/chat_database.cpython-313.pyc +0 -0
- __pycache__/grpc.cpython-310.pyc +0 -0
- __pycache__/grpc.cpython-313.pyc +0 -0
- __pycache__/grpc_code.cpython-310.pyc +0 -0
- __pycache__/istftnet.cpython-310.pyc +0 -0
- __pycache__/istftnet.cpython-312.pyc +0 -0
- __pycache__/istftnet.cpython-313.pyc +0 -0
- __pycache__/kokoro.cpython-310.pyc +0 -0
- __pycache__/kokoro.cpython-312.pyc +0 -0
- __pycache__/kokoro.cpython-313.pyc +0 -0
- __pycache__/models.cpython-310.pyc +0 -0
- __pycache__/models.cpython-312.pyc +0 -0
- __pycache__/models.cpython-313.pyc +0 -0
- __pycache__/plbert.cpython-310.pyc +0 -0
- __pycache__/plbert.cpython-312.pyc +0 -0
- __pycache__/plbert.cpython-313.pyc +0 -0
- __pycache__/queue.cpython-310.pyc +0 -0
- __pycache__/text_to_speech_pb2.cpython-310.pyc +0 -0
- __pycache__/text_to_speech_pb2.cpython-313.pyc +0 -0
- __pycache__/text_to_speech_pb2_grpc.cpython-310.pyc +0 -0
- __pycache__/text_to_speech_pb2_grpc.cpython-313.pyc +0 -0
- app.py +423 -157
- app2.py +456 -0
- app_old.py +190 -0
- backend/.DS_Store +0 -0
- backend/.gitignore +0 -2
- backend/app.js +15 -2
- backend/config.env +0 -1
- backend/config.js +0 -7
- backend/controller/chat.js +105 -0
- backend/controller/file.js +189 -0
- backend/controller/prompt.js +31 -0
- backend/handle-realtime-tts/makegRPCconnection.js +4 -5
- backend/handle-realtime-tts/sttModelSocket.js +55 -25
- backend/handle-realtime-tts/sttModelSocket_whisper.js +300 -0
- backend/handle-realtime-tts/text_to_speech.proto +5 -0
- backend/handle-realtime-tts/text_to_speech_whisper.proto +41 -0
- backend/package-lock.json +103 -5
- backend/package.json +3 -3
- backend/routes/chat.routes.js +15 -0
- backend/routes/prompt.routes.js +7 -0
- backend/routes/rag.routes.js +66 -0
- chat_database.py +262 -20
- chat_history.pkl +2 -2
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
.env CHANGED
@@ -1 +1,7 @@
-OPENAI_API_KEY =
+OPENAI_API_KEY = sk-fsuFBJSKfbashjvasSFBASJ
+
+LLM_PROVIDER = google
+
+GEMINI_API_KEY = AIfuwfnqf8qsfj3P9o
+
+LLM_MODEL = gemini-2.0-flash
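For reference, a minimal sketch of how a Python process might consume this file, assuming the `python-dotenv` package (the variable names come from the diff above; `load_config` is a hypothetical helper, not code from this commit):

```python
# Hypothetical helper: read the .env above with python-dotenv.
import os
from dotenv import load_dotenv

def load_config() -> dict:
    load_dotenv()  # reads .env from the working directory
    provider = os.getenv("LLM_PROVIDER", "openai")
    # Pick the API key that matches the configured provider.
    if provider == "google":
        key = os.getenv("GEMINI_API_KEY")
    else:
        key = os.getenv("OPENAI_API_KEY")
    return {
        "provider": provider,
        "api_key": key,
        "model": os.getenv("LLM_MODEL", "gemini-2.0-flash"),
    }
```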
.gitattributes CHANGED
@@ -37,3 +37,4 @@ TTS-Spaces-Arena-25-Dec-2024.png filter=lfs diff=lfs merge=lfs -text
 HEARME.wav filter=lfs diff=lfs merge=lfs -text
 demo/af_sky.wav filter=lfs diff=lfs merge=lfs -text
 output.wav filter=lfs diff=lfs merge=lfs -text
+frontend/bun.lockb filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -0,0 +1,12 @@
+__pycache__
+qdrant_storage
+node_modules
+models/
+__pycache__
+test3.py
+test4.py
+virtual/
+langchain_parser*
+qdrant_search_test*
+
+.env.prod
README.md CHANGED
@@ -1,208 +1,314 @@
----
-license: mit
-base_model:
-- hexgrad/Kokoro-82M
----
-# **VocRT**
-This repository contains the complete codebase for building your personal Realtime Voice-to-Voice (V2V) solution. It integrates a powerful TTS model, gRPC communication, an Express server, and a React-based client. Follow this guide to set up and explore the system effectively.
-
-## **Repository Structure**
 ```
 ```
-
-## **Setup Guide**
-
-### **
 git clone https://huggingface.co/anuragsingh922/VocRT
 cd VocRT
 ```
-
-Create a virtual environment to manage dependencies:
-
-python3 -m venv venv
 source venv/bin/activate
 ```
-
-#### Windows
 venv\Scripts\activate
 ```
-
----
-
 ### **Step 3: Install Python Dependencies**
-
-```
 pip install --upgrade pip setuptools wheel
 pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
-pip install
-```
-
-### **Installing eSpeak**
-`eSpeak` is a necessary dependency for the VocRT system. Follow the instructions below to install it on your platform:
-
-#### **Ubuntu/Linux**
-Use the `apt-get` package manager to install `eSpeak`:
-```bash
-sudo apt-get update
-sudo apt-get install espeak
-```
-
-#### **macOS**
-Install `eSpeak` using [Homebrew](https://brew.sh/):
-1. Ensure Homebrew is installed on your system:
-   ```bash
-   /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
-   ```
-2. Install `espeak`:
-   ```bash
-   brew install espeak
-   ```
-
-#### **Windows**
-For Windows, follow these steps to install `eSpeak`:
-1. Download the eSpeak installer from the official website: [eSpeak Downloads](http://espeak.sourceforge.net/download.html).
-2. Run the installer and follow the on-screen instructions to complete the installation.
-3. Add the `eSpeak` installation path to your system's `PATH` environment variable:
-   - Open **System Properties** → **Advanced** → **Environment Variables**.
-   - In the "System Variables" section, find the `Path` variable and edit it.
-   - Add the path to the `espeak.exe` file (e.g., `C:\Program Files (x86)\eSpeak`).
-4. Verify the installation:
-   Open Command Prompt and run:
-   ```cmd
-   espeak --version
-   ```
-
-1. Navigate to the `backend` directory:
-   ```bash
-   cd backend
-   ```
-2. Install Node.js dependencies:
-   ```bash
-   npm install
-   ```
-3. Update the `config.env` file with your Deepgram API key:
-   - Open `config.env` in a text editor.
-   - Replace `<deepgram_api_key>` with your actual Deepgram API key.
-4. Start the Express server:
-   ```bash
-   node app.js
-   ```
-
-1. Open a new terminal and navigate to the `frontend` directory:
-   ```bash
-   cd frontend
-   ```
-2. Install client dependencies:
-   ```bash
-   npm install
-   ```
-3. Start the client:
-   ```bash
-   npm start
-   ```
-
-### **Step
-1. Add your OpenAI API key to the `.env` file:
-   - Open `.env` in a text editor.
-   - Replace `<openai_api_key>` with your actual OpenAI API key.
-
-### **Step 7:
-- Once all servers are running:
-  1. Access the React client at [http://localhost:3000](http://localhost:3000).
-  2. Interact with the VocRT system via the web interface.
-
-1. **Realtime voice response generation**: Convert speech input into speech with minimal latency.
-2. **React Client**: A user-friendly frontend for interaction.
-3. **Express Backend**: Handles API requests and integrates the VocRT system with external services.
-4. **gRPC Communication**: Seamless communication between the VocRT server and other components.
-5. **Configurable APIs**: Integrates with OpenAI and Deepgram APIs for speech recognition and text generation.
-
-- munch
-- python-dotenv
-- openai
-- grpcio, grpcio-tools
-- espeak
-
-- Express server dependencies (`npm install` in `backend`).
-- React client dependencies (`npm install` in `frontend`).
 ---
-
-Contributions are welcome! Feel free to fork this repository and create a pull request with your improvements.
 ---
-
-- [Hugging Face](https://huggingface.co/) for hosting the Kokoro-82M model.
-- The amazing communities behind PyTorch, OpenAI, and Deepgram APIs.
+# **VocRT - Personal Realtime Voice-to-Voice AI Solution**
+
+[License: MIT](https://opensource.org/licenses/MIT)
+[Docker](https://hub.docker.com/r/anuragsingh922/vocrt)
+[Python](https://www.python.org/)
+
+VocRT is a comprehensive, privacy-first **Realtime Voice-to-Voice (V2V)** solution that enables natural conversations with AI. Built with cutting-edge TTS models, RAG capabilities, and seamless integration, VocRT processes your voice input and responds with high-quality synthesized speech in real time.
+
+## **🚀 Key Features**
+
+### **Real-time Voice Processing**
+
+- **Ultra-low latency** voice-to-voice conversion
+- **High-quality speech synthesis** using the Kokoro-82M model
+- **Customizable voice selection** with multiple voice options
+- **Adjustable threshold and silence duration** for an optimal user experience
+
+### **Advanced RAG Capabilities**
+
+- **Multi-format document support**: PDF, CSV, TXT, PPT, PPTX, DOC, DOCX, XLS, XLSX
+- **URL content extraction**: process web pages, Medium blogs, and online PDFs
+- **Unlimited document uploads** without usage limits or billing concerns
+- **100% privacy-first approach** with local processing
+
+### **Privacy & Cost Benefits**
+
+- **No API usage limits** or recurring charges
+- **Complete data privacy**: all processing happens locally
+- **Offline capability**: use a local LLM model if resources allow
+- **No data sharing** with external AI services
+
+## **🏗️ Architecture Overview**
+
 ```
+┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
+│  React Client   │◄──►│ Express Server  │◄──►│  VocRT Engine   │
+│   (Frontend)    │    │   (Backend)     │    │    (Python)     │
+└─────────────────┘    └─────────────────┘    └─────────────────┘
+                                                      │
+                              ________________________|
+                             │                        │
+                             ▼                        ▼
+                   ┌──────────────────┐      ┌─────────────────┐
+                   │    Embeddings    │      │   Whisper STT   │
+                   │   (e5-base-v2)   │      │   Kokoro TTS    │
+                   │    Qdrant DB     │      │                 │
+                   │  (Vector Store)  │      └─────────────────┘
+                   └──────────────────┘
 ```
+
+## **📁 Repository Structure**
+
+```
+VocRT/
+├── backend/            # Express.js server
+├── frontend/           # React client application
+├── models/             # AI models directory
+├── voices/             # Available voice profiles
+├── demo/               # Sample audio and demo files
+├── .env                # Environment configuration
+├── requirements.txt    # Python dependencies
+└── README.md           # Project documentation
+```
+
+## **🐳 Quick Start with Docker**
+
+**Docker Hub**: [anuragsingh922/vocrt](https://hub.docker.com/r/anuragsingh922/vocrt)
+
+```
+# Pull and run the VocRT container
+docker pull anuragsingh922/vocrt
+docker run -p 3000:3000 -p 8080:8080 anuragsingh922/vocrt
+```
+## **🛠️ Manual Installation**
+
+### **Prerequisites**
+
+- **Python 3.10** (required)
+- **Node.js 16+** and npm
+- **Docker** (for the Qdrant vector database)
+- **Git** for cloning repositories
+
+### **Step 1: Clone Repository**
+
+```
 git clone https://huggingface.co/anuragsingh922/VocRT
 cd VocRT
 ```
+
+### **Step 2: Python Environment Setup**
+
+#### **macOS/Linux:**
+
+```
+python3.10 -m venv venv
 source venv/bin/activate
 ```
+
+#### **Windows:**
+
+```
+python3.10 -m venv venv
 venv\Scripts\activate
 ```
+
 ### **Step 3: Install Python Dependencies**
+
+```
 pip install --upgrade pip setuptools wheel
 pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+pip install -r requirements.txt
+```
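A quick sanity check that the nightly CPU wheels installed cleanly before downloading the larger models (a sketch, not part of the README):

```python
# Sketch: verify the PyTorch install from Step 3.
import torch

print(torch.__version__)           # expect a nightly/dev version string
print(torch.cuda.is_available())   # False on the CPU-only wheels
```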
+### **Step 4: Install eSpeak**
+
+#### **Ubuntu/Debian:**
+
+```
+sudo apt-get update && sudo apt-get install espeak
 ```
+
+#### **macOS:**
+
+```
+# Install Homebrew if not present
+/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+# Install eSpeak
+brew install espeak
+```
+
+#### **Windows:**
+
+1. Download from the [eSpeak official website](http://espeak.sourceforge.net/download.html)
+2. Run the installer and follow the instructions
+3. Add the installation path to the system PATH environment variable
+4. Verify the installation: `espeak --version`
+
+#### **Verification:**
+
+```
+espeak "VocRT installation successful!"
+```
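Since the technology stack later lists `phonemizer` alongside eSpeak, a minimal sketch of confirming the eSpeak backend is reachable from Python (assuming the `phonemizer` package is in `requirements.txt`; this snippet is not part of the commit):

```python
# Sketch: check that phonemizer can drive the espeak backend.
from phonemizer import phonemize

print(phonemize("VocRT installation successful",
                language="en-us", backend="espeak"))
```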
+### **Step 5: Backend Setup (Express.js)**
+
+```
+cd backend
+npm install
+node app.js
+```
+
+### **Step 6: Frontend Setup (React)**
+
+```
+cd frontend
+npm install
+npm run dev
+```
+### **Step 7: Qdrant Vector Database Setup**
+
+**Documentation**: [Qdrant Quickstart Guide](https://qdrant.tech/documentation/quickstart)
+
+```
+# Pull the Qdrant image
+docker pull qdrant/qdrant
+
+# Start the Qdrant container
+docker run -p 6333:6333 -p 6334:6334 \
+    -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
+    qdrant/qdrant
+```
+
+**Access Points:**
+
+- **REST API**: [http://localhost:6333](http://localhost:6333)
+- **Web Dashboard**: [http://localhost:6333/dashboard](http://localhost:6333/dashboard)
+- **gRPC API**: [http://localhost:6334](http://localhost:6334)
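Once the container is up, a minimal connectivity sketch with the official `qdrant-client` package (not shown in this diff; purely a smoke test):

```python
# Sketch: connect to the local Qdrant instance started above.
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")
print(client.get_collections())  # empty on a fresh storage volume
```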
+### **Step 8: Download Required Models**
+
+#### **Embedding Model:**
+
+Clone [e5-base-v2](https://huggingface.co/intfloat/e5-base-v2) to `models/e5-base-v2`
+
+#### **Whisper STT Model:**
+
+Choose your preferred Whisper model size:
+
+- **tiny**: fastest, lower accuracy
+- **base**: balanced performance
+- **small**: better accuracy
+- **medium/large**: highest accuracy, slower processing
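A minimal sketch of querying the locally cloned e5-base-v2 with `sentence-transformers` (the repo's actual retrieval code is not part of this diff; note that e5 models expect `query:`/`passage:` prefixes on their inputs):

```python
# Sketch: embed a query and a passage with the local e5-base-v2 clone.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("models/e5-base-v2")  # local clone from Step 8
vecs = model.encode(
    ["query: how do I start the backend?",
     "passage: Run `node app.js` inside the backend directory."],
    normalize_embeddings=True,
)
print(vecs.shape)  # (2, 768): e5-base-v2 produces 768-dim embeddings
```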
+### **Step 9: Environment Configuration**
+
+Edit the `.env` file with your API credentials:
+
+```
+# LLM Configuration
+OPENAI_API_KEY=your_openai_api_key_here
+GEMINI_API_KEY=your_gemini_api_key_here
+LLM_PROVIDER=google          # 'openai' for GPT or 'google' for Gemini
+LLM_MODEL=gemini-2.0-flash   # or your preferred model
+```
+
+### **Step 10: Launch VocRT Server**
+
+```
+python3 app.py
+```
+## **🎯 Usage Guide**
+
+1. **Access the application**: navigate to [http://localhost:3000](http://localhost:3000)
+2. **Select a voice profile**: choose from the available voice options
+3. **Configure settings**: adjust the silence duration for optimal performance
+4. **Add context**: upload documents, provide URLs, or enter text for AI context
+5. **Start the conversation**: begin speaking and enjoy real-time voice responses
+
+## **📊 Supported Document Formats**
+
+| Format         | Extension       | Description                        |
+| -------------- | --------------- | ---------------------------------- |
+| **PDF**        | `.pdf`          | Portable Document Format           |
+| **Text**       | `.txt`          | Plain text files                   |
+| **Word**       | `.doc`, `.docx` | Microsoft Word documents           |
+| **Excel**      | `.xls`, `.xlsx` | Microsoft Excel spreadsheets       |
+| **PowerPoint** | `.ppt`, `.pptx` | Microsoft PowerPoint presentations |
+| **CSV**        | `.csv`          | Comma-separated values             |
+| **URLs**       | Web links       | Online content, blogs, PDFs        |
+## **🤖 AI Models & Technology Stack**
+
+### **Core Models**
+
+- **TTS**: [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) - high-quality text-to-speech
+- **STT**: OpenAI Whisper - accurate speech recognition
+- **Embeddings**: [e5-base-v2](https://huggingface.co/intfloat/e5-base-v2) - semantic text understanding
+- **LLM**: OpenAI GPT / Google Gemini - natural language processing
+
+### **Technology Stack**
+
+- **Backend**: Python, Express.js, gRPC
+- **Frontend**: React, Vite
+- **Database**: Qdrant (vector database)
+- **Containerization**: Docker
+- **Audio Processing**: Whisper, eSpeak, phonemizer
+
+## **🔧 Performance Optimization**
+
+### **Hardware Recommendations**
+
+- **CPU**: multi-core processor (4+ cores recommended)
+- **RAM**: 4 GB+ for optimal performance
+- **Storage**: SSD for faster model loading
+- **GPU**: optional; accelerated inference can reduce latency by up to 60%
+
+### **Configuration Tips**
+
+- Tune the **silence duration** for a natural conversation flow
+- Use **smaller Whisper models** for faster STT processing
+- Enable **GPU acceleration** if available
+
+## **🤝 Contributing**
+
+We welcome contributions from the community! Here's how you can help:
+
+### **Ways to Contribute**
+
+- 🐛 **Bug Reports**: submit issues with detailed reproduction steps
+- 💡 **Feature Requests**: suggest new capabilities and improvements
+- 📝 **Documentation**: improve guides, tutorials, and API docs
+- 🔧 **Code Contributions**: submit pull requests with enhancements
+
+### **Development Setup**
+
+1. Fork the repository
+2. Create a feature branch: `git checkout -b feature/amazing-feature`
+3. Commit changes: `git commit -m 'Add amazing feature'`
+4. Push to the branch: `git push origin feature/amazing-feature`
+5. Open a Pull Request
+
+## **📄 License**
+
+This project is licensed under the **MIT License**.
+
+## **🙏 Acknowledgments**
+
+Special thanks to the amazing open-source communities:
+
+- **[Hugging Face](https://huggingface.co/)** - for hosting and maintaining AI models
+- **[Kokoro-82M Team](https://huggingface.co/hexgrad/Kokoro-82M)** - exceptional TTS model
+- **[OpenAI Whisper](https://github.com/openai/whisper)** - revolutionary speech recognition
+- **[Qdrant](https://qdrant.tech/)** - high-performance vector database
+- **[React](https://reactjs.org/)** & **[Node.js](https://nodejs.org/)** communities
+
+## **📞 Support & Contact**
+
+- **Email**: [anuragjadu922@gmail.com](mailto:anuragjadu922@gmail.com)
+
 ---
+
+**⭐ If VocRT helps your projects, please consider giving it a star!**
+
 ---
+
+_Built with ❤️ for the open-source community_
__pycache__/chat_database.cpython-310.pyc CHANGED
Binary files a/__pycache__/chat_database.cpython-310.pyc and b/__pycache__/chat_database.cpython-310.pyc differ

__pycache__/chat_database.cpython-313.pyc DELETED
Binary file (2.87 kB)

__pycache__/grpc.cpython-310.pyc DELETED
Binary file (4.17 kB)

__pycache__/grpc.cpython-313.pyc DELETED
Binary file (7.31 kB)

__pycache__/grpc_code.cpython-310.pyc DELETED
Binary file (4.18 kB)

__pycache__/istftnet.cpython-310.pyc CHANGED
Binary files a/__pycache__/istftnet.cpython-310.pyc and b/__pycache__/istftnet.cpython-310.pyc differ

__pycache__/istftnet.cpython-312.pyc DELETED
Binary file (30.6 kB)

__pycache__/istftnet.cpython-313.pyc DELETED
Binary file (30.5 kB)

__pycache__/kokoro.cpython-310.pyc CHANGED
Binary files a/__pycache__/kokoro.cpython-310.pyc and b/__pycache__/kokoro.cpython-310.pyc differ

__pycache__/kokoro.cpython-312.pyc DELETED
Binary file (13.7 kB)

__pycache__/kokoro.cpython-313.pyc DELETED
Binary file (13.8 kB)

__pycache__/models.cpython-310.pyc CHANGED
Binary files a/__pycache__/models.cpython-310.pyc and b/__pycache__/models.cpython-310.pyc differ

__pycache__/models.cpython-312.pyc DELETED
Binary file (25.8 kB)

__pycache__/models.cpython-313.pyc DELETED
Binary file (25.9 kB)

__pycache__/plbert.cpython-310.pyc CHANGED
Binary files a/__pycache__/plbert.cpython-310.pyc and b/__pycache__/plbert.cpython-310.pyc differ

__pycache__/plbert.cpython-312.pyc DELETED
Binary file (1.15 kB)

__pycache__/plbert.cpython-313.pyc DELETED
Binary file (1.22 kB)

__pycache__/queue.cpython-310.pyc DELETED
Binary file (134 Bytes)

__pycache__/text_to_speech_pb2.cpython-310.pyc CHANGED
Binary files a/__pycache__/text_to_speech_pb2.cpython-310.pyc and b/__pycache__/text_to_speech_pb2.cpython-310.pyc differ

__pycache__/text_to_speech_pb2.cpython-313.pyc DELETED
Binary file (2.27 kB)

__pycache__/text_to_speech_pb2_grpc.cpython-310.pyc CHANGED
Binary files a/__pycache__/text_to_speech_pb2_grpc.cpython-310.pyc and b/__pycache__/text_to_speech_pb2_grpc.cpython-310.pyc differ

__pycache__/text_to_speech_pb2_grpc.cpython-313.pyc DELETED
Binary file (4.43 kB)
app.py CHANGED
@@ -1,206 +1,472 @@
 from concurrent import futures
 import torch
 from models import build_model
-import numpy as np
-import re
-import wave
-from kokoro import generate
-from openai import OpenAI
 from collections import deque
 import grpc
 import text_to_speech_pb2
 import text_to_speech_pb2_grpc
-import io
 import os
-from dotenv import load_dotenv
-
-load_dotenv()
-
-# Device configuration
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-# Load the Kokoro model
-MODEL = build_model('kokoro-v0_19.pth', device)
 VOICE_NAME = [
     'af',
     'af_bella', 'af_sarah', 'am_adam', 'am_michael',
     'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
     'af_nicole', 'af_sky',
 ][0]
-VOICEPACK = torch.load(f'voices/{VOICE_NAME}.pt', weights_only=True).to(device)
-
-client = OpenAI(
-)
-
-        return
-
-        chat_history = get_chat_history(session_id)
-        response = client.chat.completions.create(
-            model='gpt-3.5-turbo',
-            messages=chat_history,
-            stream=True
-        )
-        return response
-    except Exception as e:
-        print("Error in getResponse : ", e)
-
-def get_audio_bytes(audio_data, sample_rate=24000):
-    wav_bytes = io.BytesIO()
-    with wave.open(wav_bytes, 'wb') as wav_file:
-        wav_file.setnchannels(1)
-        wav_file.setsampwidth(2)
-        wav_file.setframerate(sample_rate)
-        audio_int16 = (audio_data * 32767).astype(np.int16)
-        wav_file.writeframes(audio_int16.tobytes())
-    wav_bytes.seek(0)
-    return wav_bytes.read()
-
-def dummy_bytes():
-    buffer = io.BytesIO()
-    dummy_data = b"This is a test of dummy byte data."
-    buffer.write(dummy_data)
-    buffer.seek(0)
-    byte_value = buffer.getvalue()
-    return byte_value

 class TextToSpeechServicer(text_to_speech_pb2_grpc.TextToSpeechServiceServicer):
         try:
             parameters = {
                 "processing_active": False,
                 "queue": deque(),
                 "file_number": 0,
                 "session_id": "",
-                "interrupt_seq"
             }
                     continue
                         session_id=parameters["session_id"],
-                        sequence_id
-                        transcript=
                     )
-                    parameters["file_number"] += 1
-                    parameters["queue"].append(
-                    yield from self.process_queue(parameters)
-
-                if final_response:
-                    parameters["file_number"] += 1
-                    parameters["queue"].append((final_response, parameters["file_number"]))
-                    if not parameters["processing_active"]:
-                        yield from self.process_queue(parameters)
-
-                elif field == 'status':
-                    transcript = request.status.transcript
-                    played_seq = request.status.played_seq
-                    interrupt_seq = request.status.interrupt_seq
-                    parameters["interrupt_seq"] = interrupt_seq
-                    save_chat_entry(parameters["session_id"], "assistant", transcript)
-                    continue
-                else:
-                    continue
-        except Exception as e:
-            print("Error in ProcessText:", e)

                 break
-            sentence, file_number = parameters["queue"].popleft()
-            if file_number <= int(parameters["interrupt_seq"]):
-                continue
-            combined_audio = generate_audio_from_chunks(sentence, MODEL, VOICEPACK, VOICE_NAME)
-            audio_bytes = get_audio_bytes(combined_audio)
-            # filename = save_audio_to_file(combined_audio, file_number)
-            yield text_to_speech_pb2.ProcessTextResponse(
                 buffer=audio_bytes,
-                session_id=
-                sequence_id=str(
                 transcript=sentence,
             )
-        parameters["processing_active"] = False
-        print("Error in process_queue:", e)

-def serve():
     print("Starting gRPC server...")
     server.add_insecure_port('[::]:8081')
     print("gRPC server is running on port 8081")
-    server.wait_for_termination()

 if __name__ == "__main__":
-    serve
 from concurrent import futures
+import asyncio
 import torch
 from models import build_model
 from collections import deque
 import grpc
 import text_to_speech_pb2
 import text_to_speech_pb2_grpc
+from chat_database import save_chat_entry
+import fastAPI
+from providers.audio_provider import get_audio_bytes, dummy_bytes, generate_audio_stream
+from providers.llm_provider import getResponseWithRagAsync, getResponseAsync
+import numpy as np
 import os
+import re
+import time
+from silero_vad import load_silero_vad, VADIterator
+import random
+from providers.filler_words import filler_phrases
+from scipy.io.wavfile import write
+from faster_whisper import WhisperModel
+
+sampling_rate = 16_000
+vad_model = load_silero_vad()
+vad_iter = VADIterator(vad_model, sampling_rate=sampling_rate)
+frame_size = 512

 device = 'cuda' if torch.cuda.is_available() else 'cpu'

+whisper_model = WhisperModel(
+    "small",
+    device=device,
+    compute_type="int8",
+    cpu_threads=os.cpu_count(),
+    download_root="./models"
+)
+
+MODEL = build_model('kokoro-v0_19.pth', device)
 VOICE_NAME = [
     'af',
     'af_bella', 'af_sarah', 'am_adam', 'am_michael',
     'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
     'af_nicole', 'af_sky',
 ][0]

+VOICEPACK = torch.load(
+    f'voices/{VOICE_NAME}.pt', weights_only=True).to(device)

+AUDIO_FILES_DIR = 'audio_files'
+os.makedirs(AUDIO_FILES_DIR, exist_ok=True)
+
+PRE_CHUNK_LIMIT_BYTES = frame_size * 2 * 20
+
+transcription_pool = futures.ThreadPoolExecutor(max_workers=10)
+
+terminators = ['.', '?', '!', '...', '…', '?!', '!?', '‽', '。', '؟', '۔']
+
+BLACKLIST = {
+    "Give me a minute.",
+    "Let me check the details.",
+    "Give me a minute. Let me check the details."
+}
+
+
+dummy_audio = np.frombuffer(
+    np.zeros(int(16_000 * 5.0), dtype=np.float32), dtype=np.int16).astype(np.float32) / 32768.0
+
+
+def _fw_transcribe_block(audio_f32: np.ndarray) -> dict:
+    segments, info = whisper_model.transcribe(
+        audio_f32,
+        language="en",
+        beam_size=1,
+        vad_filter=False,
+        initial_prompt="Indian English accent; do not make up words.",
+        no_speech_threshold=0.25,
+        log_prob_threshold=-0.6,
+        temperature=0
+    )
+    text = "".join(seg.text for seg in segments)
+    return {"text": text, "language": info.language,
+            "language_probability": info.language_probability}
+
+
+async def safe_transcribe(audio_float32: np.ndarray):
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(
+        transcription_pool,
+        lambda: _fw_transcribe_block(audio_float32)
+    )
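A standalone usage sketch of the helper above (not part of the commit). Fed one second of silence, faster-whisper should return an empty transcript, which makes this a cheap smoke test for the thread-pool hand-off:

```python
# Sketch: exercise safe_transcribe outside the gRPC path (run inside this module).
# One second of silence at the 16 kHz rate the server expects.
test_audio = np.zeros(16_000, dtype=np.float32)
result = asyncio.run(safe_transcribe(test_audio))
print(result)  # expected: {'text': '', 'language': 'en', ...}
```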

 class TextToSpeechServicer(text_to_speech_pb2_grpc.TextToSpeechServiceServicer):
+
+    def __init__(self):
+        super().__init__()
+        self._transcribe_lock = asyncio.Lock()
+
+    async def ProcessText(self, request_iterator, context):
         try:
+            global VOICEPACK
+
+            print("New connection")
+
+            tts_queue = asyncio.Queue()
+            response_queue = asyncio.Queue()
+
             parameters = {
                 "processing_active": False,
                 "queue": deque(),
                 "file_number": 0,
                 "session_id": "",
+                "interrupt_seq": 0,
+                "temperature": 1,
+                "activeVoice": "af",
+                "in_speech": False,
+                "maxTokens": 500,
+                "audio_buffer": bytearray(),
+                "pre_chunks": bytearray(),
+                "silence_counter": 0.0,
+                "silence_duration": 0.8,  # default duration in seconds
+                "silence_threshold": 800,  # default amplitude threshold
+                "VOICEPACK": VOICEPACK,
+                "audio_count": 0,
+                "user_sequence": 0,
+                "last_file_number": 0
             }
+
+            reader = asyncio.create_task(
+                self._read_requests(request_iterator, tts_queue, response_queue, parameters))
+
+            tts = asyncio.create_task(self._tts_queue_worker(
+                tts_queue, response_queue, parameters))
+
+            try:
+                while True:
+                    resp = await response_queue.get()
+                    if resp is None:
+                        break
+                    yield resp
+            finally:
+                reader.cancel()
+                tts.cancel()
+
+        except Exception as e:
+            print("Error in ProcessText:", e)
+
+    async def _read_requests(self, request_iterator, tts_queue: asyncio.Queue, response_queue: asyncio.Queue, parameters):
+        async for request in request_iterator:
+            field = request.WhichOneof('request_data')
+            if field == 'metadata':
+                meta = request.metadata
+                # print("\n\nMetadata : ", meta)
+                if meta.session_id:
+                    parameters["session_id"] = meta.session_id
+                if meta.temperature:
+                    parameters["temperature"] = meta.temperature
+                if meta.maxTokens:
+                    parameters["maxTokens"] = meta.maxTokens
+                if meta.activeVoice:
+                    parameters["activeVoice"] = meta.activeVoice
+                    parameters["VOICEPACK"] = torch.load(
+                        f'voices/{parameters["activeVoice"]}.pt', weights_only=True).to(device)
+                    print("\n\nVoice model loaded successfully")
+                if meta.silenceDuration:
+                    silence_duration = meta.silenceDuration / 1000
+                    parameters["silence_duration"] = silence_duration
+                if meta.threshold:
+                    parameters["silence_threshold"] = meta.threshold
+
+                print("\n\nParameters : ", parameters)
+
+                # output = await safe_transcribe("output2.wav")
+
+                resp = text_to_speech_pb2.ProcessTextResponse(
+                    buffer=dummy_bytes(),
+                    session_id=parameters["session_id"],
+                    sequence_id="-10",
+                    transcript="",
+                )
+                await response_queue.put(resp)
+
+                continue
+            elif field == 'audio_data':
+
+                buffer = request.audio_data.buffer
+
+                audio_data = np.frombuffer(buffer, dtype=np.int16)
+
+                float_chunk = audio_data.astype(np.float32) / 32768.0
+
+                vad_result = vad_iter(float_chunk)
+
+                parameters["pre_chunks"].extend(buffer)
+                if len(parameters["pre_chunks"]) > PRE_CHUNK_LIMIT_BYTES:
+                    overflow = len(
+                        parameters["pre_chunks"]) - PRE_CHUNK_LIMIT_BYTES
+                    del parameters["pre_chunks"][:overflow]
+
+                if vad_result:
+                    if "start" in vad_result:
+                        parameters["in_speech"] = True
+                        parameters["audio_buffer"].extend(
+                            parameters["pre_chunks"])
+                    if "end" in vad_result:
+                        parameters["in_speech"] = False
+
+                if parameters["in_speech"]:
+                    parameters["audio_buffer"].extend(buffer)
+                    parameters["silence_counter"] = 0.0
+                    parameters["audio_count"] += 1
+                else:
+                    sample_rate = 16000
+                    duration = len(audio_data) / sample_rate
+                    parameters["silence_counter"] += duration
+
+                    if parameters["silence_counter"] >= parameters["silence_duration"]:
+                        parameters["silence_counter"] = 0.0
+                        if parameters["audio_count"] < 2:
+                            parameters["audio_count"] = 0
                             continue
+                        parameters["audio_count"] = 0
+                        print("Silence ")
+
+                        resp = text_to_speech_pb2.ProcessTextResponse(
+                            buffer=dummy_bytes(),
                             session_id=parameters["session_id"],
+                            sequence_id="-3",
+                            transcript="",
                         )
+                        await response_queue.put(resp)
+
+                        sample_rate = 16000
+
+                        audio_float = np.frombuffer(
+                            parameters["audio_buffer"], dtype=np.int16).astype(np.float32) / 32768.0
+
+                        parameters["audio_buffer"] = bytearray()
+
+                        whisper_start_time = time.time()
+                        result = ""
+                        try:
+                            result = await safe_transcribe(audio_float)
+                        except Exception as e:
+                            await tts_queue.put(("Sorry! I was not able to catch that, can you repeat that please?", parameters["file_number"]))
+                            print("Error in transcribing text : ", e)
+                            continue
+
+                        whisper_end_time = time.time()
+                        time_taken_to_transcribe = whisper_end_time - whisper_start_time
+                        print(
+                            f"Transcribing time: {time_taken_to_transcribe:.4f} seconds")
+                        transcribed_text = result["text"]
+                        print(
+                            "Transcribed Text :", transcribed_text)
+
+                        if not transcribed_text.strip():
+                            resp = text_to_speech_pb2.ProcessTextResponse(
+                                buffer=dummy_bytes(),
+                                session_id=parameters["session_id"],
+                                sequence_id="-5",
+                                transcript="",
+                            )
+                            await response_queue.put(resp)
+                            continue
+
+                        # Transcript detected --------------------------------
+
+                        if transcribed_text:
+                            parameters["queue"].clear()
+                            parameters["user_sequence"] += 1
+                            parameters["last_file_number"] = parameters["file_number"]
+                            while not response_queue.empty():
+                                try:
+                                    response_queue.get_nowait()
+                                    response_queue.task_done()
+                                except asyncio.QueueEmpty:
+                                    break
+                            while not tts_queue.empty():
+                                try:
+                                    tts_queue.get_nowait()
+                                    tts_queue.task_done()
+                                except asyncio.QueueEmpty:
+                                    break
+
+                            resp = text_to_speech_pb2.ProcessTextResponse(
+                                buffer=dummy_bytes(),
+                                session_id=parameters["session_id"],
+                                sequence_id="-4",
+                                transcript="",
+                            )
+                            await response_queue.put(resp)
+                            resp = text_to_speech_pb2.ProcessTextResponse(
+                                buffer=dummy_bytes(),
+                                session_id=parameters["session_id"],
+                                sequence_id="-2",
+                                transcript=transcribed_text,
+                            )
+                            save_chat_entry(
+                                parameters["session_id"], "user", transcribed_text)
+                            await response_queue.put(resp)
+
+                            try:
+                                filler = random.choice(filler_phrases)
+                                # await tts_queue.put((filler, parameters["file_number"]))
+                                loop = asyncio.get_event_loop()
+
+                                loop.call_later(
+                                    0,
+                                    # 1.0,
+                                    lambda: asyncio.create_task(
+                                        tts_queue.put(
+                                            (filler, parameters["file_number"]))
+                                    )
+                                )
+
+                            except Exception as e:
+                                print("Error in sending filler : ", e)
+                            final_response = ""
+                            complete_response = ""
+                            current_user_sequence = parameters["user_sequence"]
+                            response = await getResponseAsync(
+                                transcribed_text, parameters["session_id"])
+                            if response is None:
+                                continue
+                            for chunk in response:
+                                if (current_user_sequence != parameters["user_sequence"]):
+                                    break
+                                msg = chunk.choices[0].delta.content
+                                if msg:
+                                    complete_response += msg
+                                    m = re.search(r'[.?!]', msg)
+                                    if m:
+                                        idx = m.start()
+                                        segment = msg[:idx+1]
+                                        leftover = msg[idx+1:]
+                                    else:
+                                        segment, leftover = msg, ''
+
+                                    final_response += segment
+
+                                    if segment.endswith(('.', '!', '?')):
+                                        parameters["file_number"] += 1
+                                        parameters["queue"].append(
+                                            (final_response, parameters["file_number"]))
+                                        await tts_queue.put((final_response, parameters["file_number"]))
+                                        final_response = leftover
+
+                            if final_response.strip():
+                                parameters["file_number"] += 1
+                                parameters["queue"].append(
+                                    (final_response, parameters["file_number"]))
+                                await tts_queue.put((final_response, parameters["file_number"]))
+
+                            if ("Let me check" in complete_response):
+                                final_response = ""
+                                complete_response = ""
+                                current_user_sequence = parameters["user_sequence"]
+                                response = await getResponseWithRagAsync(
+                                    transcribed_text, parameters["session_id"])
+                                for chunk in response:
+                                    if (current_user_sequence != parameters["user_sequence"]):
+                                        break
+                                    msg = chunk.choices[0].delta.content
+                                    if msg:
+                                        m = re.search(r'[.?!]', msg)
+                                        if m:
+                                            idx = m.start()
+                                            segment = msg[:idx+1]
+                                            leftover = msg[idx+1:]
+                                        else:
+                                            segment, leftover = msg, ''
+
+                                        final_response += segment
+                                        complete_response += segment
+
+                                        if segment.endswith(('.', '!', '?')):
+                                            parameters["file_number"] += 1
+                                            parameters["queue"].append(
+                                                (final_response, parameters["file_number"]))
+                                            await tts_queue.put((final_response, parameters["file_number"]))
+                                            final_response = leftover
+
+                                if final_response.strip():
                                     parameters["file_number"] += 1
+                                    parameters["queue"].append(
+                                        (final_response, parameters["file_number"]))
+                                    await tts_queue.put((final_response, parameters["file_number"]))

+                continue
+
+            elif field == 'status':
+                transcript = request.status.transcript
+                played_seq = request.status.played_seq
+                interrupt_seq = request.status.interrupt_seq
+                parameters["interrupt_seq"] = interrupt_seq
+                text = transcript.strip() if transcript else ""
+                if text and text not in BLACKLIST:
+                    save_chat_entry(
+                        parameters["session_id"],
+                        "assistant",
+                        transcript
+                    )
+                continue
+            else:
+                continue
+
+    async def _tts_queue_worker(self, tts_queue: asyncio.Queue,
+                                response_queue: asyncio.Queue,
+                                params: dict):
+        """
+        Pull (text, seq) off tts_queue, run generate_audio_stream, wrap each chunk
+        in ProcessTextResponse, and push into response_queue.
+        """
+        while True:
+            item = await tts_queue.get()
+            tts_queue.task_done()
+            if item is None:
+                break
+
+            sentence, seq = item
+            # drop anything the client has already played:
+            if seq <= int(params["interrupt_seq"]):
+                continue
+
+            # stream the audio chunks, pack into gRPC responses
+            async for audio_chunk in generate_audio_stream(
+                sentence, MODEL, params["VOICEPACK"], VOICE_NAME
+            ):
+                audio_bytes = get_audio_bytes(audio_chunk)
+                if seq <= int(params["last_file_number"]):
                     break
+                resp = text_to_speech_pb2.ProcessTextResponse(
                     buffer=audio_bytes,
+                    session_id=params["session_id"],
+                    sequence_id=str(seq),
                     transcript=sentence,
                 )
+                await response_queue.put(resp)


+async def serve():
     print("Starting gRPC server...")
+
+    # Use grpc.aio.server for the gRPC async server
+    server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=10))
+    text_to_speech_pb2_grpc.add_TextToSpeechServiceServicer_to_server(
+        TextToSpeechServicer(), server)
     server.add_insecure_port('[::]:8081')
+
+    await server.start()
     print("gRPC server is running on port 8081")

+    # The serve method should wait for the server to terminate asynchronously
+    await server.wait_for_termination()

 if __name__ == "__main__":
+    # Use asyncio.run to run the asynchronous serve function
+    asyncio.run(serve())
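For context, a rough client-side sketch of driving this bidirectional stream with `grpc.aio`. The request message and field names (`ProcessTextRequest`, `Metadata`, `AudioData`) are assumptions inferred from the handler's `WhichOneof('request_data')` branches; the actual names live in `text_to_speech.proto`, which this view truncates:

```python
# Sketch (assumed message names: ProcessTextRequest, Metadata, AudioData).
import asyncio
import grpc
import text_to_speech_pb2
import text_to_speech_pb2_grpc

async def run(pcm_chunks):
    async with grpc.aio.insecure_channel("localhost:8081") as channel:
        stub = text_to_speech_pb2_grpc.TextToSpeechServiceStub(channel)

        async def requests():
            # First message carries session metadata, the rest raw 16 kHz PCM.
            yield text_to_speech_pb2.ProcessTextRequest(
                metadata=text_to_speech_pb2.Metadata(session_id="demo"))
            for chunk in pcm_chunks:
                yield text_to_speech_pb2.ProcessTextRequest(
                    audio_data=text_to_speech_pb2.AudioData(buffer=chunk))

        async for resp in stub.ProcessText(requests()):
            print(resp.sequence_id, resp.transcript)

# asyncio.run(run([b"\x00\x00" * 512]))
```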
app2.py
ADDED
@@ -0,0 +1,456 @@
+from concurrent import futures
+import asyncio
+import torch
+from models import build_model
+from collections import deque
+import grpc
+import text_to_speech_pb2
+import text_to_speech_pb2_grpc
+from chat_database import save_chat_entry
+import fastAPI
+from providers.audio_provider import get_audio_bytes, dummy_bytes, generate_audio_stream
+from providers.llm_provider import getResponseWithRagAsync, getResponseAsync
+import whisper
+import numpy as np
+import os
+import re
+import time
+from silero_vad import load_silero_vad, VADIterator
+import random
+from providers.filler_words import filler_phrases
+from scipy.io.wavfile import write
+
+sampling_rate = 16_000
+vad_model = load_silero_vad()
+vad_iter = VADIterator(vad_model, sampling_rate=sampling_rate)
+frame_size = 512
+
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+whisper_model = whisper.load_model("small", device=device).to(device).eval()
+# whisper_model = torch.compile(whisper_model)
+
+MODEL = build_model('kokoro-v0_19.pth', device)
+VOICE_NAME = [
+    'af',
+    'af_bella', 'af_sarah', 'am_adam', 'am_michael',
+    'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
+    'af_nicole', 'af_sky',
+][0]
+
+
+VOICEPACK = torch.load(
+    f'voices/{VOICE_NAME}.pt', weights_only=True).to(device)
+
+AUDIO_FILES_DIR = 'audio_files'
+os.makedirs(AUDIO_FILES_DIR, exist_ok=True)
+
+PRE_CHUNK_LIMIT_BYTES = frame_size * 2 * 20
+
+transcription_pool = futures.ThreadPoolExecutor(max_workers=10)
+
+# terminators = ['.', '?', '!']
+
+terminators = ['.', '?', '!', '...', '…', '?!', '!?', '‽', '。', '؟', '۔']
+
+BLACKLIST = {
+    "Give me a minute.",
+    "Let me check the details.",
+    "Give me a minute. Let me check the details."
+}
+
+
+dummy_audio = np.frombuffer(
+    np.zeros(int(16_000 * 5.0), dtype=np.float32), dtype=np.int16).astype(np.float32) / 32768.0
+
+
+async def safe_transcribe(audio_float32):
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(
+        transcription_pool,
+        lambda: whisper_model.transcribe(audio_float32,
+                                         language="en",
+                                         fp16=False,
+                                         no_speech_threshold=0.25,
+                                         logprob_threshold=-0.6,
+                                         prompt="Indian English accent; do not make up words.")
+    )
+
+
+class TextToSpeechServicer(text_to_speech_pb2_grpc.TextToSpeechServiceServicer):
+
+    def __init__(self):
+        super().__init__()
+        self._transcribe_lock = asyncio.Lock()
+
+    async def ProcessText(self, request_iterator, context):
+        try:
+            global VOICEPACK
+
+            print("New connection")
+
+            tts_queue = asyncio.Queue()
+            response_queue = asyncio.Queue()
+
+            parameters = {
+                "processing_active": False,
+                "queue": deque(),
+                "file_number": 0,
+                "session_id": "",
+                "interrupt_seq": 0,
+                "temperature": 1,
+                "activeVoice": "af",
+                "in_speech": False,
+                "maxTokens": 500,
+                "audio_buffer": bytearray(),
+                "pre_chunks": bytearray(),
+                "silence_counter": 0.0,
+                "silence_duration": 0.8,  # default duration in seconds
+                "silence_threshold": 800,  # default amplitude threshold
+                "VOICEPACK": VOICEPACK,
+                "audio_count": 0,
+                "user_sequence": 0,
+                "last_file_number": 0
+            }
+
+            reader = asyncio.create_task(
+                self._read_requests(request_iterator, tts_queue, response_queue, parameters))
+
+            tts = asyncio.create_task(self._tts_queue_worker(
+                tts_queue, response_queue, parameters))
+
+            try:
+                while True:
+                    resp = await response_queue.get()
+                    if resp is None:
+                        break
+                    yield resp
+            finally:
+                reader.cancel()
+                tts.cancel()
+
+        except Exception as e:
+            print("Error in ProcessText:", e)
+
+    async def _read_requests(self, request_iterator, tts_queue: asyncio.Queue, response_queue: asyncio.Queue, parameters):
+        async for request in request_iterator:
+            field = request.WhichOneof('request_data')
+            if field == 'metadata':
+                meta = request.metadata
+                # print("\n\nMetadata : ", meta)
+                if meta.session_id:
+                    parameters["session_id"] = meta.session_id
+                if meta.temperature:
+                    parameters["temperature"] = meta.temperature
+                if meta.maxTokens:
+                    parameters["maxTokens"] = meta.maxTokens
+                if meta.activeVoice:
+                    parameters["activeVoice"] = meta.activeVoice
+                    parameters["VOICEPACK"] = torch.load(
+                        f'voices/{parameters["activeVoice"]}.pt', weights_only=True).to(device)
+                    print("\n\nVoice model loaded successfully")
+                if meta.silenceDuration:
+                    silence_duration = meta.silenceDuration / 1000
+                    parameters["silence_duration"] = silence_duration
+                if meta.threshold:
+                    parameters["silence_threshold"] = meta.threshold
+
+                print("\n\nParameters : ", parameters)
+
+                # output = await safe_transcribe("output2.wav")
+
+                resp = text_to_speech_pb2.ProcessTextResponse(
+                    buffer=dummy_bytes(),
+                    session_id=parameters["session_id"],
+                    sequence_id="-10",
+                    transcript="",
+                )
+                await response_queue.put(resp)
+
+                continue
+            elif field == 'audio_data':
+
+                buffer = request.audio_data.buffer
+
+                audio_data = np.frombuffer(buffer, dtype=np.int16)
+
+                float_chunk = audio_data.astype(np.float32) / 32768.0
+
+                vad_result = vad_iter(float_chunk)
+
+                parameters["pre_chunks"].extend(buffer)
+                if len(parameters["pre_chunks"]) > PRE_CHUNK_LIMIT_BYTES:
+                    overflow = len(
+                        parameters["pre_chunks"]) - PRE_CHUNK_LIMIT_BYTES
+                    del parameters["pre_chunks"][:overflow]
+
+                if vad_result:
+                    if "start" in vad_result:
+                        parameters["in_speech"] = True
+                        parameters["audio_buffer"].extend(
+                            parameters["pre_chunks"])
+                    if "end" in vad_result:
+                        parameters["in_speech"] = False
+
+                if parameters["in_speech"]:
+                    parameters["audio_buffer"].extend(buffer)
+                    parameters["silence_counter"] = 0.0
+                    parameters["audio_count"] += 1
+                else:
+                    sample_rate = 16000
+                    duration = len(audio_data) / sample_rate
+                    parameters["silence_counter"] += duration
+
+                    if parameters["silence_counter"] >= parameters["silence_duration"]:
+                        parameters["silence_counter"] = 0.0
+                        if parameters["audio_count"] < 2:
+                            parameters["audio_count"] = 0
+                            continue
+                        parameters["audio_count"] = 0
+                        print("Silence ")
+
+                        resp = text_to_speech_pb2.ProcessTextResponse(
+                            buffer=dummy_bytes(),
+                            session_id=parameters["session_id"],
+                            sequence_id="-3",
+                            transcript="",
+                        )
+                        await response_queue.put(resp)
+
+                        sample_rate = 16000
+
+                        audio_float = np.frombuffer(
+                            parameters["audio_buffer"], dtype=np.int16).astype(np.float32) / 32768.0
+
+                        parameters["audio_buffer"] = bytearray()
+
+                        whisper_start_time = time.time()
+                        result = ""
+                        try:
+                            result = await safe_transcribe(audio_float)
+                        except Exception as e:
+                            await tts_queue.put(("Sorry! I was not able to catch that, can you repeat that please?", parameters["file_number"]))
+                            print("Error in transcribing text : ", e)
+                            continue
+
+                        whisper_end_time = time.time()
+                        time_taken_to_transcribe = whisper_end_time - whisper_start_time
+                        print(
+                            f"Transcribing time: {time_taken_to_transcribe:.4f} seconds")
+                        transcribed_text = result["text"]
+                        print(
+                            "Transcribed Text :", transcribed_text)
+
+                        if not transcribed_text.strip():
+                            resp = text_to_speech_pb2.ProcessTextResponse(
+                                buffer=dummy_bytes(),
+                                session_id=parameters["session_id"],
+                                sequence_id="-5",
+                                transcript="",
+                            )
+                            await response_queue.put(resp)
+                            continue
+
+                        # Transcript detected --------------------------------
+
+                        if transcribed_text:
+                            parameters["queue"].clear()
+                            parameters["user_sequence"] += 1
+                            parameters["last_file_number"] = parameters["file_number"]
+                            while not response_queue.empty():
+                                try:
+                                    response_queue.get_nowait()
+                                    response_queue.task_done()
+                                except asyncio.QueueEmpty:
+                                    break
+                            while not tts_queue.empty():
+                                try:
+                                    tts_queue.get_nowait()
+                                    tts_queue.task_done()
+                                except asyncio.QueueEmpty:
+                                    break
+
+                            resp = text_to_speech_pb2.ProcessTextResponse(
+                                buffer=dummy_bytes(),
+                                session_id=parameters["session_id"],
+                                sequence_id="-4",
+                                transcript="",
+                            )
+                            await response_queue.put(resp)
+                            resp = text_to_speech_pb2.ProcessTextResponse(
+                                buffer=dummy_bytes(),
+                                session_id=parameters["session_id"],
+                                sequence_id="-2",
+                                transcript=transcribed_text,
+                            )
+                            save_chat_entry(
+                                parameters["session_id"], "user", transcribed_text)
+                            await response_queue.put(resp)
+
+                            try:
+                                filler = random.choice(filler_phrases)
+                                # await tts_queue.put((filler, parameters["file_number"]))
+                                loop = asyncio.get_event_loop()
+
+                                loop.call_later(
+                                    0,
+                                    # 1.0,
+                                    lambda: asyncio.create_task(
+                                        tts_queue.put(
+                                            (filler, parameters["file_number"]))
+                                    )
+                                )
+
+                            except Exception as e:
+                                print("Error in sending filler : ", e)
+                            final_response = ""
+                            complete_response = ""
+                            current_user_sequence = parameters["user_sequence"]
+                            response = await getResponseAsync(
+                                transcribed_text, parameters["session_id"])
+                            if response is None:
+                                continue
+                            for chunk in response:
+                                if (current_user_sequence != parameters["user_sequence"]):
+                                    break
+                                msg = chunk.choices[0].delta.content
+                                if msg:
+                                    complete_response += msg
|
329 |
+
m = re.search(r'[.?!]', msg)
|
330 |
+
if m:
|
331 |
+
idx = m.start()
|
332 |
+
segment = msg[:idx+1]
|
333 |
+
leftover = msg[idx+1:]
|
334 |
+
else:
|
335 |
+
segment, leftover = msg, ''
|
336 |
+
|
337 |
+
final_response += segment
|
338 |
+
|
339 |
+
if segment.endswith(('.', '!', '?')):
|
340 |
+
parameters["file_number"] += 1
|
341 |
+
parameters["queue"].append(
|
342 |
+
(final_response, parameters["file_number"]))
|
343 |
+
await tts_queue.put((final_response, parameters["file_number"]))
|
344 |
+
final_response = leftover
|
345 |
+
|
346 |
+
if final_response.strip():
|
347 |
+
parameters["file_number"] += 1
|
348 |
+
parameters["queue"].append(
|
349 |
+
(final_response, parameters["file_number"]))
|
350 |
+
await tts_queue.put((final_response, parameters["file_number"]))
|
351 |
+
|
352 |
+
if ("Let me check" in complete_response):
|
353 |
+
final_response = ""
|
354 |
+
complete_response = ""
|
355 |
+
current_user_sequence = parameters["user_sequence"]
|
356 |
+
response = await getResponseWithRagAsync(
|
357 |
+
transcribed_text, parameters["session_id"])
|
358 |
+
for chunk in response:
|
359 |
+
if (current_user_sequence != parameters["user_sequence"]):
|
360 |
+
break
|
361 |
+
msg = chunk.choices[0].delta.content
|
362 |
+
if msg:
|
363 |
+
m = re.search(r'[.?!]', msg)
|
364 |
+
if m:
|
365 |
+
idx = m.start()
|
366 |
+
segment = msg[:idx+1]
|
367 |
+
leftover = msg[idx+1:]
|
368 |
+
else:
|
369 |
+
segment, leftover = msg, ''
|
370 |
+
|
371 |
+
final_response += segment
|
372 |
+
complete_response += segment
|
373 |
+
|
374 |
+
if segment.endswith(('.', '!', '?')):
|
375 |
+
parameters["file_number"] += 1
|
376 |
+
parameters["queue"].append(
|
377 |
+
(final_response, parameters["file_number"]))
|
378 |
+
await tts_queue.put((final_response, parameters["file_number"]))
|
379 |
+
final_response = leftover
|
380 |
+
|
381 |
+
if final_response.strip():
|
382 |
+
parameters["file_number"] += 1
|
383 |
+
parameters["queue"].append(
|
384 |
+
(final_response, parameters["file_number"]))
|
385 |
+
await tts_queue.put((final_response, parameters["file_number"]))
|
386 |
+
|
387 |
+
continue
|
388 |
+
|
389 |
+
elif field == 'status':
|
390 |
+
transcript = request.status.transcript
|
391 |
+
played_seq = request.status.played_seq
|
392 |
+
interrupt_seq = request.status.interrupt_seq
|
393 |
+
parameters["interrupt_seq"] = interrupt_seq
|
394 |
+
text = transcript.strip() if transcript else ""
|
395 |
+
if text and text not in BLACKLIST:
|
396 |
+
save_chat_entry(
|
397 |
+
parameters["session_id"],
|
398 |
+
"assistant",
|
399 |
+
transcript
|
400 |
+
)
|
401 |
+
continue
|
402 |
+
else:
|
403 |
+
continue
|
404 |
+
|
405 |
+
async def _tts_queue_worker(self, tts_queue: asyncio.Queue,
|
406 |
+
response_queue: asyncio.Queue,
|
407 |
+
params: dict):
|
408 |
+
"""
|
409 |
+
Pull (text, seq) off tts_queue, run generate_audio_stream, wrap each chunk
|
410 |
+
in ProcessTextResponse, and push into response_queue.
|
411 |
+
"""
|
412 |
+
while True:
|
413 |
+
item = await tts_queue.get()
|
414 |
+
tts_queue.task_done()
|
415 |
+
if item is None:
|
416 |
+
break
|
417 |
+
|
418 |
+
sentence, seq = item
|
419 |
+
# drop anything the client has already played:
|
420 |
+
if seq <= int(params["interrupt_seq"]):
|
421 |
+
continue
|
422 |
+
|
423 |
+
# stream the audio chunks, pack into gRPC responses
|
424 |
+
async for audio_chunk in generate_audio_stream(
|
425 |
+
sentence, MODEL, params["VOICEPACK"], VOICE_NAME
|
426 |
+
):
|
427 |
+
audio_bytes = get_audio_bytes(audio_chunk)
|
428 |
+
if seq <= int(params["last_file_number"]):
|
429 |
+
break
|
430 |
+
resp = text_to_speech_pb2.ProcessTextResponse(
|
431 |
+
buffer=audio_bytes,
|
432 |
+
session_id=params["session_id"],
|
433 |
+
sequence_id=str(seq),
|
434 |
+
transcript=sentence,
|
435 |
+
)
|
436 |
+
await response_queue.put(resp)
|
437 |
+
|
438 |
+
|
439 |
+
async def serve():
|
440 |
+
print("Starting gRPC server...")
|
441 |
+
|
442 |
+
# Use grpc.aio.server for the gRPC async server
|
443 |
+
server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=10))
|
444 |
+
text_to_speech_pb2_grpc.add_TextToSpeechServiceServicer_to_server(
|
445 |
+
TextToSpeechServicer(), server)
|
446 |
+
server.add_insecure_port('[::]:8081')
|
447 |
+
|
448 |
+
await server.start()
|
449 |
+
print("gRPC server is running on port 8081")
|
450 |
+
|
451 |
+
# The serve method should wait for the server to terminate asynchronously
|
452 |
+
await server.wait_for_termination()
|
453 |
+
|
454 |
+
if __name__ == "__main__":
|
455 |
+
# Use asyncio.run to run the asynchronous serve function
|
456 |
+
asyncio.run(serve())
|
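Note on the control protocol: the server signals state to the Node backend entirely through the `sequence_id` field of `ProcessTextResponse` (negative values are control codes, positive values carry synthesized audio). Below is a minimal client-side sketch of that convention, inferred from `app.py` and `sttModelSocket_whisper.js`; the helper name `dispatch` is hypothetical, the codes themselves come from the diff above.

```python
# Sequence_id control codes emitted by app.py (inferred from this diff).
CONTROL_CODES = {
    "-10": "connected",          # metadata accepted, stream ready
    "-5":  "stop_transcribing",  # transcription produced an empty string
    "-4":  "thinking",           # LLM request in flight
    "-3":  "transcribing",       # silence detected, STT started
    "-2":  "user_transcript",    # payload: transcript of the user's speech
    "-1":  "continue",           # resume playback
    "0":   "pause",              # pause playback
}

def dispatch(sequence_id: str, transcript: str, buffer: bytes) -> str:
    """Map a ProcessTextResponse to a client-side event name.

    Positive sequence_ids carry synthesized audio; everything else is control.
    """
    if sequence_id in CONTROL_CODES:
        return CONTROL_CODES[sequence_id]
    return "media"  # play `buffer`, display `transcript`
```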
app_old.py
ADDED
@@ -0,0 +1,190 @@
+from concurrent import futures
+import torch
+from models import build_model
+from collections import deque
+import grpc
+import text_to_speech_pb2
+import text_to_speech_pb2_grpc
+from chat_database import save_chat_entry
+import fastAPI
+from providers.audio_provider import get_audio_bytes, dummy_bytes, generate_audio_from_chunks
+from providers.llm_provider import getResponseWithRAG, getResponse
+
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+MODEL = build_model('kokoro-v0_19.pth', device)
+
+VOICE_NAME = [
+    'af',
+    'af_bella', 'af_sarah', 'am_adam', 'am_michael',
+    'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
+    'af_nicole', 'af_sky',
+][0]
+
+
+VOICEPACK = torch.load(
+    f'voices/{VOICE_NAME}.pt', weights_only=True).to(device)
+
+
+class TextToSpeechServicer(text_to_speech_pb2_grpc.TextToSpeechServiceServicer):
+    def ProcessText(self, request_iterator, context):
+        try:
+            global VOICEPACK
+            print("Received new request")
+            parameters = {
+                "processing_active": False,
+                "queue": deque(),
+                "file_number": 0,
+                "session_id": "",
+                "interrupt_seq": 0,
+                "temperature": 1,
+                "activeVoice": "af",
+                "maxTokens": 500,
+            }
+            for request in request_iterator:
+                field = request.WhichOneof('request_data')
+                if field == 'metadata':
+                    meta = request.metadata
+                    print("Metadata received:")
+                    print("  session_id:", meta.session_id)
+                    print("  silenceDuration:", meta.silenceDuration)
+                    print("  threshold:", meta.threshold)
+                    print("  temperature:", meta.temperature)
+                    print("  activeVoice:", meta.activeVoice)
+                    print("  maxTokens:", meta.maxTokens)
+                    print("Metadata : ", request.metadata)
+                    if meta.session_id:
+                        parameters["session_id"] = meta.session_id
+                    if meta.temperature:
+                        parameters["temperature"] = meta.temperature
+                    if meta.maxTokens:
+                        parameters["maxTokens"] = meta.maxTokens
+                    if meta.activeVoice:
+                        parameters["activeVoice"] = meta.activeVoice
+                        VOICEPACK = torch.load(
+                            f'voices/{parameters["activeVoice"]}.pt', weights_only=True).to(device)
+                    continue
+                elif field == 'text':
+                    text = request.text
+                    if not text:
+                        continue
+                    # yield text_to_speech_pb2.ProcessTextResponse(
+                    #     buffer=dummy_bytes(),
+                    #     session_id=parameters["session_id"],
+                    #     sequence_id="0",
+                    #     transcript="",
+                    # )
+                    # intent = check_for_rag(
+                    #     text, parameters["session_id"])
+                    # print("Intent : ", intent.intent)
+                    # print("Intent : ", intent.rag)
+                    save_chat_entry(parameters["session_id"], "user", text)
+                    parameters["queue"].clear()
+                    yield text_to_speech_pb2.ProcessTextResponse(
+                        buffer=dummy_bytes(),
+                        session_id=parameters["session_id"],
+                        sequence_id="-2",
+                        transcript=text,
+                    )
+                    final_response = ""
+                    complete_response = ""
+                    response = getResponse(text, parameters["session_id"])
+                    for chunk in response:
+                        msg = chunk.choices[0].delta.content
+                        if msg:
+                            final_response += msg
+                            complete_response += msg
+                            if final_response.endswith(('.', '!', '?')):
+                                parameters["file_number"] += 1
+                                parameters["queue"].append(
+                                    (final_response, parameters["file_number"]))
+                                final_response = ""
+                                if not parameters["processing_active"]:
+                                    yield from self.process_queue(parameters)
+
+                    if final_response:
+                        parameters["file_number"] += 1
+                        parameters["queue"].append(
+                            (final_response, parameters["file_number"]))
+                        if not parameters["processing_active"]:
+                            yield from self.process_queue(parameters)
+
+                    if ("Let me check" in complete_response):
+                        final_response = ""
+                        complete_response = ""
+                        response = getResponseWithRAG(
+                            text, parameters["session_id"])
+                        for chunk in response:
+                            msg = chunk.choices[0].delta.content
+                            if msg:
+                                final_response += msg
+                                complete_response += msg
+                                if final_response.endswith(('.', '!', '?')):
+                                    parameters["file_number"] += 1
+                                    parameters["queue"].append(
+                                        (final_response, parameters["file_number"]))
+                                    final_response = ""
+                                    if not parameters["processing_active"]:
+                                        yield from self.process_queue(parameters)
+
+                        if final_response:
+                            parameters["file_number"] += 1
+                            parameters["queue"].append(
+                                (final_response, parameters["file_number"]))
+                            if not parameters["processing_active"]:
+                                yield from self.process_queue(parameters)
+
+                elif field == 'status':
+                    transcript = request.status.transcript
+                    played_seq = request.status.played_seq
+                    interrupt_seq = request.status.interrupt_seq
+                    parameters["interrupt_seq"] = interrupt_seq
+                    save_chat_entry(
+                        parameters["session_id"], "assistant", transcript)
+                    continue
+                else:
+                    continue
+        except Exception as e:
+            print("Error in ProcessText:", e)
+
+    def process_queue(self, parameters):
+        global VOICEPACK
+        try:
+            while True:
+                if not parameters["queue"]:
+                    parameters["processing_active"] = False
+                    break
+                parameters["processing_active"] = True
+                sentence, file_number = parameters["queue"].popleft()
+                if file_number <= int(parameters["interrupt_seq"]):
+                    continue
+
+                combined_audio = generate_audio_from_chunks(
+                    sentence, MODEL, VOICEPACK, VOICE_NAME)
+                audio_bytes = get_audio_bytes(combined_audio)
+                # filename = save_audio_to_file(combined_audio, file_number)
+                yield text_to_speech_pb2.ProcessTextResponse(
+                    buffer=audio_bytes,
+                    session_id=parameters["session_id"],
+                    sequence_id=str(file_number),
+                    transcript=sentence,
+                )
+        except Exception as e:
+            parameters["processing_active"] = False
+            print("Error in process_queue:", e)
+
+
+def serve():
+    print("Starting gRPC server...")
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
+    text_to_speech_pb2_grpc.add_TextToSpeechServiceServicer_to_server(
+        TextToSpeechServicer(), server)
+    server.add_insecure_port('[::]:8081')
+    server.start()
+    print("gRPC server is running on port 8081")
+    server.wait_for_termination()
+
+
+if __name__ == "__main__":
+    serve()
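The structural change from `app_old.py` to the v2 `app.py` is worth a sketch: the old servicer yields audio inline from a single-threaded `process_queue`, while the new one decouples reading, synthesis, and streaming with `asyncio.Queue`s. The skeleton below is a minimal sketch of that pattern under the names used in this diff (`_read_requests`, `_tts_queue_worker`); bodies are elided and the wrapper name `process_text` is hypothetical.

```python
import asyncio

async def process_text(request_iterator, _read_requests, _tts_queue_worker):
    tts_queue: asyncio.Queue = asyncio.Queue()       # sentences awaiting TTS
    response_queue: asyncio.Queue = asyncio.Queue()  # gRPC responses to stream out
    parameters: dict = {}

    # reader: parses metadata/audio/status, runs VAD + Whisper, feeds tts_queue
    reader = asyncio.create_task(
        _read_requests(request_iterator, tts_queue, response_queue, parameters))
    # worker: turns sentences into audio chunks, feeds response_queue
    tts = asyncio.create_task(
        _tts_queue_worker(tts_queue, response_queue, parameters))

    try:
        while True:
            yield await response_queue.get()  # stream responses as they arrive
    finally:
        reader.cancel()
        tts.cancel()  # mirrors the tts.cancel() cleanup in the diff above
```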
backend/.DS_Store
CHANGED
Binary files a/backend/.DS_Store and b/backend/.DS_Store differ
backend/.gitignore
DELETED
@@ -1,2 +0,0 @@
-/node_modules
-.DS_Store
backend/app.js
CHANGED
@@ -9,14 +9,27 @@ app.use(express.urlencoded({ extended: true }));
 app.use(bodyParser.json());
 const port = 8080;
 
-const {
+const {
+  audio_stream,
+} = require("./handle-realtime-tts/sttModelSocket_whisper.js");
+const chatRouter = require("./routes/chat.routes.js");
+const ragRouter = require("./routes/rag.routes.js");
+const promptRouter = require("./routes/prompt.routes.js");
 
 app.get("/health", (req, res) => {
   res.send("Green");
 });
 
+app.get("/", (req, res) => {
+  res.send("VocRT express server working fine.");
+});
+
 app.ws("/v2v", audio_stream);
 
+app.use("/chat", chatRouter);
+app.use("/rag", ragRouter);
+app.use("/prompt", promptRouter);
+
 app.listen(port, () => {
-  console.log(`
+  console.log(`VocRT express server listening at http://localhost:${port}`);
 });
backend/config.env
DELETED
@@ -1 +0,0 @@
-DEEPGRAM_KEY = <deepgram_api_key>
backend/config.js
DELETED
@@ -1,7 +0,0 @@
-require("dotenv").config({ path: "./config.env" });
-
-const deepgram_key = process.env.DEEPGRAM_KEY;
-
-module.exports = {
-  deepgram_key
-};
backend/controller/chat.js
ADDED
@@ -0,0 +1,105 @@
+const axios = require("axios");
+
+const getChats = async (req, res) => {
+  try {
+    const response = await axios.get(`http://localhost:8082/get-chats`);
+    console.log(response.data);
+    if (!response.data) {
+      return res.status(200).json({
+        success: false,
+        message: "failed to get chats",
+      });
+    }
+    const chats = response.data;
+    const all_chats = chats.sort(
+      (a, b) => new Date(b.lastUpdatedAt) - new Date(a.lastUpdatedAt)
+    );
+    return res.status(200).json({
+      success: true,
+      message: "All chats",
+      chats: all_chats,
+    });
+  } catch (error) {
+    console.log(error);
+    return res
+      .status(500)
+      .json({ success: false, message: "Failed to get chat" });
+  }
+};
+const renameChats = async (req, res) => {
+  try {
+    const { sessionId, title } = req.body;
+    const response = await axios.post(`http://localhost:8082/rename-chat`, {
+      sessionId,
+      title,
+    });
+    console.log(response.data);
+    if (!response.data) {
+      return res.status(200).json({
+        success: false,
+        message: "failed to rename chats",
+      });
+    }
+    return res.status(200).json({
+      success: true,
+      message: "chat renamed!",
+      rename: { id: sessionId, title: title },
+    });
+  } catch (error) {
+    console.log(error);
+    return res
+      .status(500)
+      .json({ success: false, message: "Failed to rename chat" });
+  }
+};
+
+const createChat = async (req, res) => {
+  try {
+    const { sessionId } = req.body;
+    console.log(sessionId);
+    const response = await axios.post(
+      `http://localhost:8082/create-chat/${sessionId}`
+    );
+    if (!response.data) {
+      return res
+        .status(500)
+        .json({ success: false, message: "Failed to create chat" });
+    }
+    return res.status(200).json({
+      success: true,
+      message: "Chat created successfully",
+      sessionId: sessionId,
+    });
+  } catch (error) {
+    console.log(error);
+    return res
+      .status(500)
+      .json({ success: false, message: "Failed to create chat" });
+  }
+};
+
+const deleteChat = async (req, res) => {
+  try {
+    const { sessionId } = req.body;
+    const response = await axios.post(`http://localhost:8082/delete-chat`, {
+      sessionId,
+    });
+    if (!response.data) {
+      return res
+        .status(500)
+        .json({ success: false, message: "Failed to delete chat" });
+    }
+    return res.status(200).json({
+      success: true,
+      message: "Chat deleted successfully",
+      sessionId: sessionId,
+    });
+  } catch (error) {
+    console.log(error);
+    return res
+      .status(500)
+      .json({ success: false, message: "Failed to delete chat" });
+  }
+};
+
+module.exports = { createChat, getChats, renameChats, deleteChat };
backend/controller/file.js
ADDED
@@ -0,0 +1,189 @@
+const axios = require("axios");
+const FormData = require("form-data");
+
+const uploadPDF = async (req, res) => {
+  try {
+    const file = req.file;
+
+    if (!file) {
+      return res
+        .status(400)
+        .json({ success: false, message: "No file uploaded" });
+    }
+
+    const { name, sessionId, title, summary, categories } = req.body;
+
+    const formData = new FormData();
+    formData.append("pdf_file", file.buffer, file.originalname);
+    formData.append("name", name);
+    formData.append("sessionId", sessionId);
+    formData.append("title", title);
+    formData.append("summary", summary);
+    formData.append("categories", categories);
+
+    const response = await axios.post(
+      "http://localhost:8082/upload-pdf",
+      formData,
+      {
+        headers: {
+          ...formData.getHeaders(),
+        },
+      }
+    );
+
+    // fs.unlink(file.path, (err) => {
+    //   if (err) console.error("Error deleting the file:", err);
+    //   else console.log("File deleted successfully");
+    // });
+
+    return res.status(200).json({
+      success: true,
+      message: "PDF uploaded and processed successfully",
+      data: response.data,
+    });
+  } catch (error) {
+    console.warn("Error in uploadPDF:", error);
+
+    return res.status(500).json({
+      success: false,
+      message: "Failed to process PDF",
+      error: error.message,
+    });
+  }
+};
+
+const uploadLink = async (req, res) => {
+  try {
+    const { link, sessionId, title, summary, categories } = req.body;
+    const response = await axios.post("http://localhost:8082/process-link", {
+      link,
+      sessionId,
+      title,
+      summary,
+      categories,
+    });
+
+    console.info("Response : ", response.data);
+
+    if (response.data.status === "success") {
+      // If the FastAPI endpoint indicates success
+      return res.status(200).json({
+        success: true,
+        message: "Link processed successfully",
+        data: response.data, // Include any data if needed
+      });
+    } else {
+      // If the FastAPI endpoint indicates an error
+      return res.status(400).json({
+        success: false,
+        message:
+          response.data.detail || "Failed to process link. Please try again",
+      });
+    }
+  } catch (error) {
+    console.error("Error in uploading link : ", error);
+
+    if (error.response) {
+      const { status, data } = error.response;
+      return res.status(status).json({
+        success: false,
+        message: data.detail || "Failed to process link. Please try again",
+      });
+    } else if (error.request) {
+      console.error("No response received from the server:", error.request);
+      return res.status(500).json({
+        success: false,
+        message:
+          "No response received from the server. Please try again later.",
+      });
+    } else {
+      // Something happened in setting up the request that triggered an Error
+      console.error("Error in setting up the request:", error.message);
+      return res.status(500).json({
+        success: false,
+        message: "An unexpected error occurred. Please try again",
+      });
+    }
+  }
+};
+
+const uploadText = async (req, res) => {
+  try {
+    const { text, sessionId, name, title, summary, categories } = req.body;
+    const response = await axios.post("http://localhost:8082/process-text", {
+      text,
+      sessionId,
+      title,
+      name,
+      summary,
+      categories,
+    });
+
+    console.info("Response : ", response.data);
+
+    if (response.data.status === "success") {
+      // If the FastAPI endpoint indicates success
+      return res.status(200).json({
+        success: true,
+        message: "Text processed successfully",
+        data: response.data, // Include any data if needed
+      });
+    } else {
+      // If the FastAPI endpoint indicates an error
+      return res.status(400).json({
+        success: false,
+        message:
+          response.data.detail || "Failed to process text. Please try again",
+      });
+    }
+  } catch (error) {
+    console.error("Error in uploading text : ", error);
+
+    if (error.response) {
+      const { status, data } = error.response;
+      return res.status(status).json({
+        success: false,
+        message: data.detail || "Failed to process text. Please try again",
+      });
+    } else if (error.request) {
+      console.error("No response received from the server:", error.request);
+      return res.status(500).json({
+        success: false,
+        message:
+          "No response received from the server. Please try again later.",
+      });
+    } else {
+      // Something happened in setting up the request that triggered an Error
+      console.error("Error in setting up the request:", error.message);
+      return res.status(500).json({
+        success: false,
+        message: "An unexpected error occurred. Please try again",
+      });
+    }
+  }
+};
+
+const clearContext = async (req, res) => {
+  try {
+    const { sessionId } = req.body;
+    const response = await axios.post("http://localhost:8082/clear-context", {
+      sessionId,
+    });
+    const { data } = response;
+    if (data.status === "success") {
+      return res
+        .status(200)
+        .json({ success: true, message: "context cleared successfully" });
+    }
+    return res
+      .status(400)
+      .json({ message: "failed to clear context", success: false });
+  } catch (error) {
+    console.error("Error in clearing all context : ", error);
+    return res
+      .status(500)
+      .json({ message: "failed to clear context", success: false });
+  }
+};
+
+module.exports = { uploadPDF, uploadLink, clearContext, uploadText };
backend/controller/prompt.js
ADDED
@@ -0,0 +1,31 @@
+const axios = require("axios");
+
+const savePrompt = async (req, res) => {
+  try {
+    const { sessionId, prompt } = req.body;
+    // console.log({ sessionId, prompt });
+    const response = await axios.post(`http://localhost:8082/save-prompt`, {
+      sessionId,
+      prompt,
+    });
+    console.log(response.data);
+    if (!response.data) {
+      return res.status(200).json({
+        success: false,
+        message: "failed to save prompt",
+      });
+    }
+    return res.status(200).json({
+      success: true,
+      message: "prompt saved!",
+      rename: { id: sessionId, prompt: prompt },
+    });
+  } catch (error) {
+    console.log(error);
+    return res
+      .status(500)
+      .json({ success: false, message: "Failed to save prompt" });
+  }
+};
+
+module.exports = { savePrompt };
backend/handle-realtime-tts/makegRPCconnection.js
CHANGED
@@ -5,7 +5,7 @@ const path = require("path");
 const getgRPCConnection = (session) => {
   return new Promise((resolve, reject) => {
     protoLoader
-      .load(path.join(__dirname, "
+      .load(path.join(__dirname, "text_to_speech_whisper.proto"), {
         keepCase: true,
         longs: String,
         enums: String,
@@ -13,7 +13,8 @@ const getgRPCConnection = (session) => {
         oneofs: true,
       })
       .then((packageDefinition) => {
-        const textToSpeechProto =
+        const textToSpeechProto =
+          grpc.loadPackageDefinition(packageDefinition).texttospeech;
         const client = new textToSpeechProto.TextToSpeechService(
           "localhost:8081",
           grpc.credentials.createInsecure()
@@ -24,10 +25,8 @@ const getgRPCConnection = (session) => {
         console.log("Made connection");
         session.client = client;
 
-
         const call = client.ProcessText();
         resolve(call);
-
       })
       .catch((error) => {
         session.client = null;
@@ -37,4 +36,4 @@ const getgRPCConnection = (session) => {
   });
 };
 
-module.exports = { getgRPCConnection };
+module.exports = { getgRPCConnection };
backend/handle-realtime-tts/sttModelSocket.js
CHANGED
@@ -1,12 +1,12 @@
 const isBuffer = require("is-buffer");
 const { Buffer } = require("buffer");
-const {deepgram_key} = require("../config");
+// const { deepgram_key } = require("../config");
 const Session = require("../utils/session.js");
 const { cleanupConnection } = require("./cleangRPCconnections.js");
 const { getgRPCConnection } = require("./makegRPCconnection.js");
 const { updateChathistory } = require("../providers/updateChathistory.js");
 const { createClient, LiveTranscriptionEvents } = require("@deepgram/sdk");
-const deepgram = createClient(deepgram_key);
+// const deepgram = createClient(deepgram_key);
 
 const audio_stream = async (wss, req) => {
   try {
@@ -14,15 +14,20 @@ const audio_stream = async (wss, req) => {
 
     wss.send(JSON.stringify({ type: "initial", msg: "connected" }));
 
-
     const connection = deepgram.listen.live({
       punctuate: true,
       interim_results: true,
       speech_final: true,
       encoding: "linear16",
       sample_rate: 16000,
+      // model: "nova-2-conversationalai",
+      // utterance_end_ms: "500",
+      endpointing: 500,
+      // model: "nova-2-phonecall",
       model: "nova-2",
-
+      // model: "nova-2-general",
+      language: "en",
+      // language: "en-IN",
       version: "latest",
     });
 
@@ -32,8 +37,7 @@ const audio_stream = async (wss, req) => {
       } catch (error) {
         console.error("Error in calling ml server : ", error);
       }
-    }
-
+    };
 
     connection.on(LiveTranscriptionEvents.Open, () => {
       console.log(LiveTranscriptionEvents.Open);
@@ -59,7 +63,6 @@ const audio_stream = async (wss, req) => {
       });
     });
 
-
     wss.on("message", async (message) => {
       try {
         if (isBuffer(message) && session.call) {
@@ -105,14 +108,34 @@ const audio_stream = async (wss, req) => {
 
           const {
             sessionId,
+            silenceDuration,
+            threshold,
+            temperature,
+            activeVoice,
+            maxTokens,
           } = JSON.parse(msg);
+          console.log({
+            sessionId,
+            silenceDuration,
+            threshold,
+            temperature,
+            activeVoice,
+            maxTokens,
+          });
           const metadata = {
             metadata: {
-              session_id: sessionId,
+              session_id: String(sessionId),
+              silenceDuration: parseInt(silenceDuration, 10) || 100,
+              threshold: parseInt(threshold, 10) || 100,
+              temperature: parseFloat(temperature) || 0.7,
+              activeVoice: String(activeVoice),
+              maxTokens: parseInt(maxTokens, 10) || 500,
            },
          };
+
+          console.log(metadata);
          if (session.call) {
-            console.log("Sending metadata.")
+            console.log("Sending metadata.");
            session.call.write(metadata);
          }
        } catch (err) {
@@ -122,7 +145,8 @@ const audio_stream = async (wss, req) => {
        session.call.on("data", (response) => {
          console.log("Data : ", response);
 
-          const {session_id
+          const { session_id, sequence_id, transcript, buffer } =
+            response;
 
          const metadata = JSON.stringify({
            session_id: session_id,
@@ -134,12 +158,12 @@ const audio_stream = async (wss, req) => {
            session.latency = Date.now();
            wss.send(JSON.stringify({ type: "clear", msg: "clear" }));
            session.chathistory = [...session.chathistorybackup];
-            wss.send(
-              JSON.stringify({
-                type: "chathistory",
-                msg: session.chathistorybackup,
-              })
-            );
+            // wss.send(
+            //   JSON.stringify({
+            //     type: "chathistory",
+            //     msg: session.chathistorybackup,
+            //   })
+            // );
            const wavBuffer = Buffer.concat([
              Buffer.from(metadata),
              Buffer.from([0]),
@@ -156,8 +180,11 @@ const audio_stream = async (wss, req) => {
            });
            wss.send(
              JSON.stringify({
-                type: "
-                msg:
+                type: "chat",
+                msg: {
+                  role: "user",
+                  content: transcript,
+                },
              })
            );
            session.chathistorybackup.push({
@@ -212,8 +239,11 @@ const audio_stream = async (wss, req) => {
 
          wss.send(
            JSON.stringify({
-              type: "
-              msg:
+              type: "chat",
+              msg: {
+                role: "ai",
+                content: transcript,
+              },
            })
          );
        });
@@ -223,7 +253,7 @@ const audio_stream = async (wss, req) => {
          await cleanupConnection(session);
          try {
            wss.send(JSON.stringify({ type: "end", msg: "end" }));
-          } catch (err) {
+          } catch (err) {}
          console.log("Stream ended");
        });
 
@@ -231,7 +261,7 @@ const audio_stream = async (wss, req) => {
          console.error(`Stream error: ${error}`);
          try {
            wss.send(JSON.stringify({ type: "end", msg: "end" }));
-          } catch (err) {
+          } catch (err) {}
          await cleanupConnection(session);
        });
        break;
@@ -240,7 +270,7 @@ const audio_stream = async (wss, req) => {
          const { session_id, sequence_id, transcript } = msg;
          const status = {
            status: {
-              transcript
+              transcript: transcript,
              played_seq: sequence_id,
              interrupt_seq: sequence_id,
            },
@@ -280,9 +310,9 @@ const audio_stream = async (wss, req) => {
    });
  } catch (err) {
    try {
-      console.log(err)
+      console.log(err);
      wss.send(JSON.stringify({ type: "end", msg: "end" }));
-    } catch (err) {
+    } catch (err) {}
  }
 };
backend/handle-realtime-tts/sttModelSocket_whisper.js
ADDED
@@ -0,0 +1,300 @@
+const isBuffer = require("is-buffer");
+const { Buffer } = require("buffer");
+const Session = require("../utils/session.js");
+const { cleanupConnection } = require("./cleangRPCconnections.js");
+const { getgRPCConnection } = require("./makegRPCconnection.js");
+const { updateChathistory } = require("../providers/updateChathistory.js");
+
+const audio_stream = async (wss, req) => {
+  try {
+    const session = new Session();
+
+    wss.send(JSON.stringify({ type: "initial", msg: "connected" }));
+
+    wss.on("message", async (message) => {
+      try {
+        if (isBuffer(message) && session.call) {
+          try {
+            const audio_message = {
+              audio_data: {
+                buffer: message,
+              },
+            };
+
+            try {
+              // Whisper
+              session.call.write(audio_message);
+            } catch (error) {
+              console.log("Error sending buffer to Whisper : ", error);
+            }
+          } catch (err) {
+            console.error("Error writing to stream: ", err);
+          }
+        }
+
+        if (typeof message === "string") {
+          try {
+            const data = JSON.parse(message);
+
+            const { type, msg } = data;
+
+            switch (type) {
+              case "start":
+                session.starttime = Date.now();
+                session.chathistory = [];
+                session.chathistorybackup = [];
+                console.log("Making Connection with gRPC...");
+                try {
+                  console.time("grpcconnection");
+                  session.call = await getgRPCConnection(session);
+                  console.timeEnd("grpcconnection");
+                  const state = session.channel.getConnectivityState(false);
+                  console.log(`Client : ${state}`);
+                  session.saved = false;
+                  wss.send(JSON.stringify({ type: "ready", msg: "connected" }));
+                  console.log("Connected to gRPC.");
+
+                  const {
+                    sessionId,
+                    silenceDuration,
+                    threshold,
+                    temperature,
+                    activeVoice,
+                    maxTokens,
+                  } = JSON.parse(msg);
+                  console.log({
+                    sessionId,
+                    silenceDuration,
+                    threshold,
+                    temperature,
+                    activeVoice,
+                    maxTokens,
+                  });
+
+                  console.log(silenceDuration);
+                  const metadata = {
+                    metadata: {
+                      session_id: String(sessionId),
+                      silenceDuration: parseInt(silenceDuration * 1000, 10) || 800,
+                      threshold: parseInt(threshold, 10) || 100,
+                      temperature: parseFloat(temperature) || 0.7,
+                      activeVoice: String(activeVoice),
+                      maxTokens: parseInt(maxTokens, 10) || 500,
+                    },
+                  };
+
+                  console.log(metadata);
+                  if (session.call) {
+                    console.log("Sending metadata.");
+                    session.call.write(metadata);
+                  }
+                } catch (err) {
+                  await cleanupConnection(session);
+                  console.error("Error in making gRPC Connection. : ", err);
+                }
+                session.call.on("data", (response) => {
+                  const { session_id, sequence_id, transcript, buffer } =
+                    response;
+
+                  const metadata = JSON.stringify({
+                    session_id: session_id,
+                    sequence_id: sequence_id,
+                    transcript: transcript,
+                  });
+
+                  if (sequence_id === "-2") {
+                    session.latency = Date.now();
+                    wss.send(JSON.stringify({ type: "clear", msg: "clear" }));
+                    session.chathistory = [...session.chathistorybackup];
+                    // wss.send(
+                    //   JSON.stringify({
+                    //     type: "chathistory",
+                    //     msg: session.chathistorybackup,
+                    //   })
+                    // );
+                    const wavBuffer = Buffer.concat([
+                      Buffer.from(metadata),
+                      Buffer.from([0]),
+                      buffer,
+                    ]);
+
+                    const base64buffer = wavBuffer.toString("base64");
+                    wss.send(
+                      JSON.stringify({ type: "media", msg: base64buffer })
+                    );
+                    session.chathistory.push({
+                      speaker: "USER",
+                      content: transcript,
+                    });
+                    wss.send(
+                      JSON.stringify({
+                        type: "chat",
+                        msg: {
+                          role: "user",
+                          content: transcript,
+                        },
+                      })
+                    );
+                    session.chathistorybackup.push({
+                      speaker: "USER",
+                      content: transcript,
+                    });
+                    return;
+                  }
+
+                  if (sequence_id === "0") {
+                    wss.send(JSON.stringify({ type: "pause", msg: "pause" }));
+                    session.cansend = false;
+                    return;
+                  }
+
+                  if (sequence_id === "-3") {
+                    wss.send(
+                      JSON.stringify({
+                        type: "transcribing",
+                        msg: "transcribing",
+                      })
+                    );
+                    return;
+                  }
+                  if (sequence_id === "-5") {
+                    wss.send(
+                      JSON.stringify({
+                        type: "stop_transcribing",
+                        msg: "stop_transcribing",
+                      })
+                    );
+                    return;
+                  }
+                  if (sequence_id === "-10") {
+                    wss.send(
+                      JSON.stringify({
+                        type: "connected",
+                        msg: "connected",
+                      })
+                    );
+                    return;
+                  }
+                  if (sequence_id === "-4") {
+                    wss.send(
+                      JSON.stringify({ type: "thinking", msg: "thinking" })
+                    );
+                    return;
+                  }
+
+                  if (sequence_id === "-1") {
+                    wss.send(
+                      JSON.stringify({ type: "continue", msg: "continue" })
+                    );
+                    return;
+                  }
+
+                  if (sequence_id === "1") {
+                    const latency = Date.now() - session.latency;
+                    session.latency = 0;
+                    // wss.send(JSON.stringify({ type: "clear", msg: "clear" }));
+                    session.cansend = true;
+                  }
+
+                  if (!buffer) {
+                    return;
+                  }
+
+                  if (!session.cansend && sequence_id !== "0") {
+                    return;
+                  }
+
+                  // Combine header and PCM data into a single Buffer
+                  const wavBuffer = Buffer.concat([
+                    Buffer.from(metadata),
+                    Buffer.from([0]),
+                    buffer,
+                  ]);
+
+                  const base64buffer = wavBuffer.toString("base64");
+                  wss.send(
+                    JSON.stringify({ type: "media", msg: base64buffer })
+                  );
+
+                  updateChathistory(transcript, false, session);
+
+                  wss.send(
+                    JSON.stringify({
+                      type: "chat",
+                      msg: {
+                        role: "ai",
+                        content: transcript,
+                      },
+                    })
+                  );
+                });
+
+                session.call.on("end", async () => {
+                  console.log("Ended");
+                  await cleanupConnection(session);
+                  try {
+                    wss.send(JSON.stringify({ type: "end", msg: "end" }));
+                  } catch (err) {}
+                  console.log("Stream ended");
+                });
+
+                session.call.on("error", async (error) => {
+                  console.error(`Stream error: ${error}`);
+                  try {
+                    wss.send(JSON.stringify({ type: "end", msg: "end" }));
+                  } catch (err) {}
+                  await cleanupConnection(session);
+                });
+                break;
+
+              case "status":
+                const { session_id, sequence_id, transcript } = msg;
+                const status = {
+                  status: {
+                    transcript: transcript,
+                    played_seq: sequence_id,
+                    interrupt_seq: sequence_id,
+                  },
+                };
+
+                if (session.call) {
+                  session.call.write(status);
+                }
+
+                updateChathistory(transcript, true, session);
+                break;
+
+              case "stop":
+                console.log("Client stopped the stream.");
+                await cleanupConnection(session);
+                break;
+              default:
+                console.log("Type not handled.");
+            }
+          } catch (err) {
+            console.log(`Not a valid json : ${err}`);
+          }
+        }
+      } catch (err) {
+        console.error(`Error in wss.onmessage : ${err}`);
+      }
+    });
+
+    wss.on("close", async () => {
+      await cleanupConnection(session);
+      console.log("WebSocket connection closed.");
+    });
+
+    wss.on("error", async (err) => {
+      console.error(`WebSocket error: ${err}`);
+      await cleanupConnection(session);
+    });
+  } catch (err) {
+    try {
+      console.log(err);
+      wss.send(JSON.stringify({ type: "end", msg: "end" }));
+    } catch (err) {}
+  }
+};
+
+module.exports = { audio_stream };
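The "media" messages this file sends frame each payload as UTF-8 JSON metadata, a single 0x00 separator byte, then the raw PCM buffer, base64-encoded as a whole. A minimal sketch of the corresponding parse on a receiving client follows; `parse_media_frame` is a hypothetical helper, not part of the repo, but the framing itself comes from the `Buffer.concat` call above.

```python
# Parse the frames built by sttModelSocket_whisper.js:
# [JSON metadata][0x00][PCM bytes], base64-encoded into the "media" message.
import base64
import json

def parse_media_frame(b64_payload: str):
    raw = base64.b64decode(b64_payload)
    sep = raw.index(0)                      # first 0x00 byte ends the JSON header
    meta = json.loads(raw[:sep].decode("utf-8"))
    pcm = raw[sep + 1:]                     # 16-bit PCM audio follows the separator
    return meta["session_id"], meta["sequence_id"], meta["transcript"], pcm
```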
backend/handle-realtime-tts/text_to_speech.proto
CHANGED
@@ -23,6 +23,11 @@ message ProcessTextResponse {
 
 message Meta {
   string session_id = 1;
+  int32 silenceDuration = 2;
+  int32 threshold = 3;
+  float temperature = 4;
+  string activeVoice = 5;
+  int32 maxTokens = 6;
 }
 
 message Status {
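With the extended `Meta` message, the backend can tune VAD and generation per session. A sketch of building the same message from Python, assuming stubs generated from this proto (`text_to_speech_pb2`); the values are illustrative, the field names come from the diff above.

```python
import text_to_speech_pb2

req = text_to_speech_pb2.ProcessTextRequest(
    metadata=text_to_speech_pb2.Meta(
        session_id="demo-session",
        silenceDuration=800,   # ms of silence before STT fires
        threshold=100,
        temperature=0.7,
        activeVoice="af",
        maxTokens=500,
    )
)
```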
backend/handle-realtime-tts/text_to_speech_whisper.proto
ADDED
@@ -0,0 +1,41 @@
+syntax = "proto3";
+
+package texttospeech;
+
+service TextToSpeechService {
+  rpc ProcessText (stream ProcessTextRequest) returns (stream ProcessTextResponse);
+}
+
+message ProcessTextRequest {
+  oneof request_data {
+    Audio audio_data = 1;
+    Meta metadata = 2;
+    Status status = 3;
+  }
+}
+
+message ProcessTextResponse {
+  bytes buffer = 1;
+  string session_id = 2;
+  string sequence_id = 3;
+  string transcript = 4;
+}
+
+message Meta {
+  string session_id = 1;
+  int32 silenceDuration = 2;
+  int32 threshold = 3;
+  float temperature = 4;
+  string activeVoice = 5;
+  int32 maxTokens = 6;
+}
+
+message Status {
+  string transcript = 1;
+  string played_seq = 2;
+  string interrupt_seq = 3;
+}
+
+message Audio {
+  bytes buffer = 1;
+}
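For testing the server outside the Node backend, here is a minimal async client sketch for the bidirectional `ProcessText` stream, assuming stubs generated from `text_to_speech_whisper.proto` via grpcio-tools (`text_to_speech_pb2` / `text_to_speech_pb2_grpc`); the audio source is hypothetical.

```python
import asyncio
import grpc
import text_to_speech_pb2
import text_to_speech_pb2_grpc

async def run(pcm_chunks):
    async with grpc.aio.insecure_channel("localhost:8081") as channel:
        stub = text_to_speech_pb2_grpc.TextToSpeechServiceStub(channel)

        async def requests():
            # metadata first, then raw 16 kHz int16 PCM chunks
            yield text_to_speech_pb2.ProcessTextRequest(
                metadata=text_to_speech_pb2.Meta(session_id="demo", activeVoice="af"))
            for chunk in pcm_chunks:
                yield text_to_speech_pb2.ProcessTextRequest(
                    audio_data=text_to_speech_pb2.Audio(buffer=chunk))

        async for resp in stub.ProcessText(requests()):
            print(resp.sequence_id, resp.transcript, len(resp.buffer))

# asyncio.run(run([b"\x00\x00" * 1600]))  # example: 100 ms of silence
```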
backend/package-lock.json
CHANGED
@@ -12,7 +12,7 @@
       "@deepgram/sdk": "^3.9.0",
       "@geckos.io/server": "^3.0.0",
       "@grpc/grpc-js": "^1.11.3",
-      "axios": "^1.
+      "axios": "^1.8.4",
       "bcryptjs": "^2.4.3",
       "cors": "^2.8.5",
       "crypto": "^1.0.1",
@@ -21,7 +21,8 @@
       "express-ws": "^5.0.2",
       "is-buffer": "^2.0.5",
       "jsonwebtoken": "^9.0.2",
-      "module": "^1.2.5"
+      "module": "^1.2.5",
+      "multer": "^1.4.5-lts.2"
     }
   },
   "node_modules/@deepgram/captions": {
@@ -276,6 +277,11 @@
       "url": "https://github.com/chalk/ansi-styles?sponsor=1"
     }
   },
+    "node_modules/append-field": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/append-field/-/append-field-1.0.0.tgz",
+      "integrity": "sha512-klpgFSWLW1ZEs8svjfb7g4qWY0YS5imI82dTg+QahUvJ8YqAY0P10Uk8tTyh9ZGuYEZEMaeJYCF5BFuX552hsw=="
+    },
     "node_modules/arr-diff": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-2.0.0.tgz",
@@ -325,9 +331,9 @@
     }
   },
   "node_modules/axios": {
-    "version": "1.
-    "resolved": "https://registry.npmjs.org/axios/-/axios-1.
-    "integrity": "sha512-
+    "version": "1.8.4",
+    "resolved": "https://registry.npmjs.org/axios/-/axios-1.8.4.tgz",
+    "integrity": "sha512-eBSYY4Y68NNlHbHBMdeDmKNtDgXWhQsJcGqzO3iLUM0GraQFSS9cVgPX5I9b3lbdFKyYoAEGAZF1DwhTaljNAw==",
     "dependencies": {
       "follow-redirects": "^1.15.6",
       "form-data": "^4.0.0",
@@ -446,6 +452,22 @@
       "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz",
       "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA=="
     },
+    "node_modules/buffer-from": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz",
+      "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ=="
+    },
+    "node_modules/busboy": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
+      "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
+      "dependencies": {
+        "streamsearch": "^1.1.0"
+      },
+      "engines": {
+        "node": ">=10.16.0"
+      }
+    },
     "node_modules/bytes": {
       "version": "3.1.2",
       "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
@@ -2301,6 +2323,74 @@
       "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
       "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A=="
     },
+    "node_modules/multer": {
+      "version": "1.4.5-lts.2",
+      "resolved": "https://registry.npmjs.org/multer/-/multer-1.4.5-lts.2.tgz",
+      "integrity": "sha512-VzGiVigcG9zUAoCNU+xShztrlr1auZOlurXynNvO9GiWD1/mTBbUljOKY+qMeazBqXgRnjzeEgJI/wyjJUHg9A==",
+      "dependencies": {
+        "append-field": "^1.0.0",
+        "busboy": "^1.0.0",
+        "concat-stream": "^1.5.2",
+        "mkdirp": "^0.5.4",
+        "object-assign": "^4.1.1",
+        "type-is": "^1.6.4",
+        "xtend": "^4.0.0"
+      },
+      "engines": {
+        "node": ">= 6.0.0"
+      }
+    },
+    "node_modules/multer/node_modules/concat-stream": {
+      "version": "1.6.2",
+      "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
+      "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
+      "engines": [
+        "node >= 0.8"
+      ],
+      "dependencies": {
+        "buffer-from": "^1.0.0",
+        "inherits": "^2.0.3",
+        "readable-stream": "^2.2.2",
+        "typedarray": "^0.0.6"
+      }
+    },
+    "node_modules/multer/node_modules/process-nextick-args": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz",
+      "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag=="
+    },
+    "node_modules/multer/node_modules/readable-stream": {
+      "version": "2.3.8",
+      "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz",
+      "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==",
+      "dependencies": {
+        "core-util-is": "~1.0.0",
+        "inherits": "~2.0.3",
+        "isarray": "~1.0.0",
+        "process-nextick-args": "~2.0.0",
+        "safe-buffer": "~5.1.1",
+        "string_decoder": "~1.1.1",
+        "util-deprecate": "~1.0.1"
+      }
+    },
+    "node_modules/multer/node_modules/safe-buffer": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
+      "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
},
|
2381 |
+
"node_modules/multer/node_modules/string_decoder": {
|
2382 |
+
"version": "1.1.1",
|
2383 |
+
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
|
2384 |
+
"integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
|
2385 |
+
"dependencies": {
|
2386 |
+
"safe-buffer": "~5.1.0"
|
2387 |
+
}
|
2388 |
+
},
|
2389 |
+
"node_modules/multer/node_modules/typedarray": {
|
2390 |
+
"version": "0.0.6",
|
2391 |
+
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
|
2392 |
+
"integrity": "sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA=="
|
2393 |
+
},
|
2394 |
"node_modules/napi-build-utils": {
|
2395 |
"version": "1.0.2",
|
2396 |
"resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz",
|
|
|
3285 |
"resolved": "https://registry.npmjs.org/stream-shift/-/stream-shift-1.0.3.tgz",
|
3286 |
"integrity": "sha512-76ORR0DO1o1hlKwTbi/DM3EXWGf3ZJYO8cXX5RJwnul2DEg2oyoZyjLNoQM8WsvZiFKCRfC1O0J7iCvie3RZmQ=="
|
3287 |
},
|
3288 |
+
"node_modules/streamsearch": {
|
3289 |
+
"version": "1.1.0",
|
3290 |
+
"resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
|
3291 |
+
"integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
|
3292 |
+
"engines": {
|
3293 |
+
"node": ">=10.0.0"
|
3294 |
+
}
|
3295 |
+
},
|
3296 |
"node_modules/string_decoder": {
|
3297 |
"version": "1.3.0",
|
3298 |
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
|
backend/package.json
CHANGED
@@ -10,10 +10,9 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "@deepgram/sdk": "^3.9.0",
     "@geckos.io/server": "^3.0.0",
     "@grpc/grpc-js": "^1.11.3",
-    "axios": "^1.
+    "axios": "^1.8.4",
     "bcryptjs": "^2.4.3",
     "cors": "^2.8.5",
     "crypto": "^1.0.1",
@@ -22,6 +21,7 @@
     "express-ws": "^5.0.2",
     "is-buffer": "^2.0.5",
     "jsonwebtoken": "^9.0.2",
-    "module": "^1.2.5"
+    "module": "^1.2.5",
+    "multer": "^1.4.5-lts.2"
   }
 }
backend/routes/chat.routes.js
ADDED
@@ -0,0 +1,15 @@
+const express = require("express");
+const {
+  createChat,
+  getChats,
+  renameChats,
+  deleteChat,
+} = require("../controller/chat");
+const chatRouter = express.Router();
+
+chatRouter.post("/create-chat", createChat);
+chatRouter.post("/get-chats", getChats);
+chatRouter.post("/rename-chat", renameChats);
+chatRouter.post("/delete-chat", deleteChat);
+
+module.exports = chatRouter;
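These routes only delegate to controller/chat.js. As a minimal smoke-test sketch, a client might call them as below; the base URL, the /chat mount point, and the JSON field names (sessionId, title) are assumptions, since the real payload shapes live in the controller:

```python
# Hypothetical client calls for the chat routes above (not part of the repo).
import requests

BASE = "http://localhost:8080/chat"  # assumed host and mount point

requests.post(f"{BASE}/create-chat", json={"sessionId": "demo-1"})
chats = requests.post(f"{BASE}/get-chats", json={}).json()
requests.post(f"{BASE}/rename-chat", json={"sessionId": "demo-1", "title": "Kokoro test"})
requests.post(f"{BASE}/delete-chat", json={"sessionId": "demo-1"})
```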
backend/routes/prompt.routes.js
ADDED
@@ -0,0 +1,7 @@
+const express = require("express");
+const promptRouter = express.Router();
+const { savePrompt } = require("../controller/prompt");
+
+promptRouter.post("/", savePrompt);
+
+module.exports = promptRouter;
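The prompt router exposes a single POST at its mount root. A corresponding sketch, where the /prompt mount point and payload keys are again assumptions (savePrompt in controller/prompt.js defines the actual contract):

```python
import requests

# Assumed mount point and payload shape; see controller/prompt.js for the real contract.
requests.post(
    "http://localhost:8080/prompt/",
    json={"sessionId": "demo-1", "prompt": "You are a concise voice assistant."},
)
```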
backend/routes/rag.routes.js
ADDED
@@ -0,0 +1,66 @@
+const express = require("express");
+const {
+  uploadPDF,
+  uploadLink,
+  clearContext,
+  uploadText,
+} = require("../controller/file");
+const multer = require("multer");
+const path = require("path");
+const ragRouter = express.Router();
+
+// const storage = multer.diskStorage({
+//   destination: function (req, file, cb) {
+//     cb(null, "uploads/");
+//   },
+//   filename: function (req, file, cb) {
+//     cb(null, Date.now() + "-" + file.originalname);
+//   },
+// });
+
+const storage = multer.memoryStorage();
+
+const fileFilter = function (req, file, cb) {
+  const allowedExt = /\.(pdf|csv|ppt|pptx|doc|docx|xls|xlsx|txt)$/i;
+
+  // Allowed MIME types
+  const allowedMime = [
+    "application/pdf",
+    "text/csv",
+    "application/vnd.ms-powerpoint",
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    "application/msword",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "application/vnd.ms-excel",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    "text/plain",
+  ];
+
+  // Check extension
+  const extname = allowedExt.test(path.extname(file.originalname));
+  // Check mime
+  const mimetype = allowedMime.includes(file.mimetype);
+
+  if (extname && mimetype) {
+    cb(null, true);
+  } else {
+    cb(
+      new Error(
+        "Invalid file type. Only document files are allowed: PDF, CSV, PPT(X), DOC(X), XLS(X), TXT."
+      ),
+      false
+    );
+  }
+};
+
+const upload = multer({
+  storage: storage,
+  fileFilter: fileFilter,
+});
+
+ragRouter.post("/pdf", upload.single("pdfFile"), uploadPDF);
+ragRouter.post("/link", uploadLink);
+ragRouter.post("/text", uploadText);
+ragRouter.post("/clear-context", clearContext);
+
+module.exports = ragRouter;
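Because the router uses multer.memoryStorage() with upload.single("pdfFile"), a document must arrive as multipart/form-data under the field name pdfFile, and fileFilter rejects anything whose extension and MIME type are not both on the allow-lists. A hedged client sketch follows; the server URL, the /rag mount point, and the sessionId form field are assumptions:

```python
import requests

BASE = "http://localhost:8080/rag"  # assumed mount point for ragRouter

# "pdfFile" is fixed by upload.single("pdfFile") above; "sessionId" is a guess
# at what controller/file.js expects alongside the upload.
with open("notes.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE}/pdf",
        files={"pdfFile": ("notes.pdf", f, "application/pdf")},
        data={"sessionId": "demo-1"},
    )
print(resp.status_code, resp.text)

# Link/text ingestion and context reset are plain JSON routes:
requests.post(f"{BASE}/text", json={"sessionId": "demo-1", "text": "Some context."})
requests.post(f"{BASE}/clear-context", json={"sessionId": "demo-1"})
```

Note that both checks must pass: a .pdf sent with MIME type application/octet-stream, for example, would be rejected by fileFilter even though its extension is allowed.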
chat_database.py
CHANGED
@@ -1,5 +1,6 @@
 import pickle
 import os
+from datetime import datetime, timezone

 FILE_PATH = "chat_history.pkl"

@@ -7,28 +8,107 @@ if not os.path.exists(FILE_PATH):
     with open(FILE_PATH, "wb") as file:
         pickle.dump({}, file)

+
+async def save_context_detail(session_id, name, title, summary, categories):
+    try:
+        try:
+            with open(FILE_PATH, "rb") as file:
+                data = pickle.load(file)
+        except (FileNotFoundError, EOFError):
+            data = {}
+
+        now = datetime.now(timezone.utc).isoformat()
+
+        if session_id not in data:
+            print("Session id not in data")
+            data[session_id] = {
+                "title": "New Chat",
+                "createdAt": now,
+                "lastUpdatedAt": now,
+                "chat": [],
+                "context": [],
+                "prompt": "",
+            }
+
+        # setdefault keeps the appended entry attached to the stored session
+        # even if an older record is missing the "context" key
+        contexts = data[session_id].setdefault("context", [])
+        contexts.append({"name": name, "title": title,
+                         "summary": summary, "categories": categories})
+
+        data[session_id]["lastUpdatedAt"] = now
+
+        with open(FILE_PATH, "wb") as file:
+            pickle.dump(data, file)
+
+    except Exception as e:
+        print(f"Error saving context entry: {e}")
+
+
+def clear_context_detail(session_id):
+    try:
+        try:
+            with open(FILE_PATH, "rb") as file:
+                data = pickle.load(file)
+        except (FileNotFoundError, EOFError):
+            data = {}
+
+        now = datetime.now(timezone.utc).isoformat()
+
+        if session_id not in data:
+            print("Session id not in data")
+            return False
+
+        data[session_id]["context"] = []
+        data[session_id]["lastUpdatedAt"] = now
+
+        with open(FILE_PATH, "wb") as file:
+            pickle.dump(data, file)
+
+    except Exception as e:
+        print(f"Error clearing context: {e}")
+
+
 def save_chat_entry(session_id, role, transcript):
     try:
+        try:
+            with open(FILE_PATH, "rb") as file:
+                data = pickle.load(file)
+        except (FileNotFoundError, EOFError):
+            data = {}
+
+        now = datetime.now(timezone.utc).isoformat()

         if session_id not in data:
+            print("Session id not in data")
+            data[session_id] = {
+                "title": "New Chat",
+                "createdAt": now,
+                "lastUpdatedAt": now,
+                "chat": [],
+                "context": [],
+                "prompt": "",
+            }
+
+        messages = data[session_id]["chat"]

         if role == "user":
+            messages.append({
                 "role": role,
                 "transcript": transcript
             })
         else:
+            if messages and messages[-1]["role"] == "assistant":
+                messages[-1]["transcript"] += " " + transcript
             else:
+                messages.append({
                     "role": role,
                     "transcript": transcript
                 })

+        data[session_id]["lastUpdatedAt"] = now
+
         with open(FILE_PATH, "wb") as file:
             pickle.dump(data, file)

@@ -36,29 +116,191 @@ def save_chat_entry(session_id, role, transcript):
         print(f"Error saving chat entry: {e}")


-def get_chat_history(session_id):
+def get_chat_history(session_id, limit=15):
     try:
         with open(FILE_PATH, "rb") as file:
             data = pickle.load(file)

-        if not chat_history:
+        session = data.get(session_id)
+        if not session or not isinstance(session, dict):
             return []

+        # or "messages" if you've standardized on that
+        # messages = session.get("chat", [])
+
+        # message_history = []
+        # for entry in messages:
+        #     role = entry.get('role', '')
+        #     transcript = entry.get('transcript', '')
+        #     if role and transcript:
+        #         message_history.append({"role": role, "content": transcript})
+
+        # return message_history[-15:]
+
+        tail = session.get("chat", [])[-limit:]
+        chat_history = [
+            {"role": msg["role"], "content": msg["transcript"]}
+            for msg in tail
+            if msg.get("role") and msg.get("transcript")
+        ]
+
+        user_prompt = session.get("prompt", "")
-        return message_history
+        return chat_history, user_prompt

     except (FileNotFoundError, pickle.UnpicklingError) as e:
         print(f"Error reading or parsing the file: {e}")
-        return []
+        return [], ""
     except Exception as e:
         print(f"Unexpected error: {e}")
-        return []
+        return [], ""
+
+
+def get_all_chat_details():
+    try:
+        with open(FILE_PATH, "rb") as file:
+            data = pickle.load(file)
+
+        chat_list = []
+
+        for session_id, chat in data.items():
+            if not isinstance(chat, dict):
+                continue
+
+            messages = []
+            for entry in chat.get("chat", []):
+                role = entry.get("role", "")
+                transcript = entry.get("transcript", "")
+                if role and transcript:
+                    messages.append({
+                        "role": role,
+                        "content": transcript
+                    })
+
+            chat_list.append({
+                "id": session_id,
+                "title": chat.get("title", "Untitled"),
+                "createdAt": chat.get("createdAt"),
+                "lastUpdatedAt": chat.get("lastUpdatedAt"),
+                "chat": messages,
+                "context": chat.get("context", []),
+                "prompt": chat.get("prompt", ""),
+            })
+
+        return chat_list
+
+    except (FileNotFoundError, EOFError):
+        return []
+
+    except Exception as e:
+        print(f"Error reading chats: {e}")
+        return []
+
+
+def create_chat_entry(session_id):
+    try:
+        # Load existing data or initialize an empty dict
+        try:
+            with open(FILE_PATH, "rb") as file:
+                data = pickle.load(file)
+        except (FileNotFoundError, EOFError):
+            data = {}
+
+        now = datetime.now(timezone.utc).isoformat()
+
+        if session_id not in data:
+            data[session_id] = {
+                "title": "New Chat",
+                "createdAt": now,
+                "lastUpdatedAt": now,
+                "chat": [],
+                "context": [],
+                "prompt": "",
+            }
+
+        # Save the updated data back to file
+        with open(FILE_PATH, "wb") as file:
+            pickle.dump(data, file)
+
+        return True
+
+    except Exception as e:
+        print(f"Error creating chat entry: {e}")
+        return False
+
+
+def rename_chat_title(session_id, title):
+    try:
+        try:
+            with open(FILE_PATH, "rb") as file:
+                data = pickle.load(file)
+        except (FileNotFoundError, EOFError):
+            data = {}
+
+        if session_id not in data:
+            return False
+
+        data[session_id]["title"] = title
+        data[session_id]["lastUpdatedAt"] = datetime.now(
+            timezone.utc).isoformat()
+
+        with open(FILE_PATH, "wb") as file:
+            pickle.dump(data, file)
+
+        print(f"Renamed chat: {data[session_id]}")
+        return True
+
+    except Exception as e:
+        print(f"Error renaming chat title: {e}")
+        return False
+
+
+def save_system_prompt(session_id, prompt):
+    try:
+        try:
+            with open(FILE_PATH, "rb") as file:
+                data = pickle.load(file)
+        except (FileNotFoundError, EOFError):
+            data = {}
+
+        now = datetime.now(timezone.utc).isoformat()
+
+        if session_id not in data:
+            return False
+
+        data[session_id]["prompt"] = prompt
+        data[session_id]["lastUpdatedAt"] = now
+
+        with open(FILE_PATH, "wb") as file:
+            pickle.dump(data, file)
+
+        print(f"Saved prompt: {data[session_id]}")
+        return True
+
+    except Exception as e:
+        print(f"Error saving system prompt: {e}")
+        return False
+
+
+def delete_chat(session_id):
+    try:
+        try:
+            with open(FILE_PATH, "rb") as file:
+                data = pickle.load(file)
+        except (FileNotFoundError, EOFError):
+            data = {}
+
+        if session_id not in data:
+            return True
+
+        data.pop(session_id)
+
+        with open(FILE_PATH, "wb") as file:
+            pickle.dump(data, file)
+
+        return True
+
+    except Exception as e:
+        print(f"Error deleting chat: {e}")
+        return False
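With these helpers, each entry in chat_history.pkl becomes a per-session record with title, createdAt, lastUpdatedAt, chat, context, and prompt keys. A short, illustrative sketch of the intended call sequence; note that get_chat_history now returns a (history, prompt) pair rather than a bare list, and that consecutive assistant turns are merged into one transcript:

```python
from chat_database import (create_chat_entry, save_chat_entry,
                           save_system_prompt, get_chat_history)

session = "demo-session"
create_chat_entry(session)                      # creates the session record if absent
save_system_prompt(session, "Answer briefly.")  # stored under the "prompt" key
save_chat_entry(session, "user", "Hello there")
save_chat_entry(session, "assistant", "Hi!")
save_chat_entry(session, "assistant", "How can I help?")  # merged into the previous assistant turn

history, prompt = get_chat_history(session, limit=15)
print(prompt)   # Answer briefly.
print(history)  # [{'role': 'user', ...}, {'role': 'assistant', 'content': 'Hi! How can I help?'}]
```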
chat_history.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f5a6e279b1243be5d92db335883c01146efde4af08a42a887b5fd326ed2a3636
+size 73280