PierreBrunelle committed on
Commit
e1aa0dd
1 Parent(s): 514c787

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -0
app.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import io
4
+ import base64
5
+ import uuid
6
+ import pixeltable as pxt
7
+ from pixeltable.iterators import DocumentSplitter
8
+ import numpy as np
9
+ from pixeltable.functions.huggingface import sentence_transformer
10
+ from pixeltable.functions import openai
11
+ from gradio.themes import Monochrome
12
+
13
+ import os
14
+ import getpass
15
+
16
+ # Store API keys
17
# Ask for the key interactively only when the environment does not already
# define it.
if os.environ.get('OPENAI_API_KEY') is None:
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key:')
19
+
20
+ # Set up embedding function
21
@pxt.expr_udf
def e5_embed(text: str) -> np.ndarray:
    """Embed ``text`` with the ``intfloat/e5-large-v2`` sentence-transformer model."""
    embedding = sentence_transformer(text, model_id='intfloat/e5-large-v2')
    return embedding
24
+
25
+ # Create prompt function
26
@pxt.udf
def create_prompt(top_k_list: list[dict], question: str) -> str:
    """Assemble the LLM prompt from retrieved passages and the question.

    The ``'text'`` field of every entry in ``top_k_list`` is emitted in the
    reverse of the order given, with a blank line between passages, under a
    ``PASSAGES:`` heading; the question follows under ``QUESTION:``.
    """
    passages = [entry['text'] for entry in reversed(top_k_list)]
    passage_block = '\n\n'.join(passages)
    return f'\nPASSAGES:\n{passage_block}\nQUESTION:\n{question}'
36
+
37
def process_files(pdf_files, chunk_limit, chunk_separator):
    """Build the Pixeltable RAG pipeline for the uploaded PDF documents.

    Recreates the ``chatbot_demo`` directory, stores the PDFs in
    ``chatbot_demo.documents``, splits them into the ``chatbot_demo.chunks``
    view, adds an embedding index on the chunk text, and adds computed
    columns on the documents table that retrieve context, build a prompt and
    call OpenAI.

    Args:
        pdf_files: Uploaded files. Each item is either a path string or an
            object exposing its path as ``.name``. May be ``None`` or empty.
        chunk_limit: Chunk size limit; only passed to the splitter when
            ``chunk_separator`` is ``"token_limit"`` or ``"char_limit"``.
        chunk_separator: Separator name handed to ``DocumentSplitter``.

    Returns:
        A status message string for display in the UI.
    """
    import logging

    # Validate the upload before touching Pixeltable, so an empty or non-PDF
    # submission neither raises nor drops a previously built index.
    candidate_paths = (
        f if isinstance(f, str) else getattr(f, 'name', f)
        for f in (pdf_files or [])
    )
    pdf_paths = [
        str(path) for path in candidate_paths
        if str(path).lower().endswith('.pdf')  # accept .PDF as well
    ]
    if not pdf_paths:
        return "No PDF files provided. Please upload at least one PDF document."

    # Initialize Pixeltable: every run starts from an empty directory.
    pxt.drop_dir('chatbot_demo', force=True)
    pxt.create_dir('chatbot_demo')

    # Create a table to store the uploaded PDF documents
    t = pxt.create_table(
        'chatbot_demo.documents',
        {
            'document': pxt.DocumentType(nullable=True),
            'question': pxt.StringType(nullable=True),
        },
    )

    # Insert the PDF files into the documents table
    t.insert([{'document': path} for path in pdf_paths])

    # Create a view that splits the documents into smaller chunks
    uses_size_limit = chunk_separator in ("token_limit", "char_limit")
    chunks_t = pxt.create_view(
        'chatbot_demo.chunks',
        t,
        iterator=DocumentSplitter.create(
            document=t.document,
            separators=chunk_separator,
            limit=chunk_limit if uses_size_limit else None,
            metadata='title,heading,sourceline',
        ),
    )

    # Add an embedding index to the chunks for similarity search
    chunks_t.add_embedding_index('text', string_embed=e5_embed)

    try:
        @chunks_t.query
        def top_k(query_text: str):
            sim = chunks_t.text.similarity(query_text)
            return (
                chunks_t.order_by(sim, asc=False)
                .select(chunks_t.text, sim=sim)
                .limit(5)
            )
    except Exception:
        # Registration remains best-effort, but the failure is recorded
        # instead of being discarded silently.
        logging.getLogger(__name__).warning(
            "Could not register the top_k query on chatbot_demo.chunks",
            exc_info=True,
        )

    # Add computed columns to the table for context retrieval and prompt creation
    t['question_context'] = chunks_t.top_k(t.question)
    t['prompt'] = create_prompt(t.question_context, t.question)

    # Prepare messages for the API
    msgs = [
        {
            'role': 'system',
            'content': 'Read the following passages and answer the question based on their contents.',
        },
        {
            'role': 'user',
            'content': t.prompt,
        },
    ]

    # Add OpenAI response column
    t['response'] = openai.chat_completions(
        model='gpt-4o-mini-2024-07-18',
        messages=msgs,
        max_tokens=300,
        top_p=0.9,
        temperature=0.7,
    )

    # Extract the answer text from the API response
    t['gpt4omini'] = t.response.choices[0].message.content

    return "Files processed successfully!"
110
+
111
def get_answer(msg):
    """Return the model's answer to ``msg`` from the documents table.

    The question is inserted as a new row into ``chatbot_demo.documents`` and
    the ``gpt4omini`` value of the table's last row is returned.

    Args:
        msg: The user's question. ``None``, empty or whitespace-only input is
            rejected without touching the table.

    Returns:
        The answer text, or a short hint when no question was supplied.
    """
    # Skip the insert (and the work it triggers) for blank input.
    if msg is None or not msg.strip():
        return "Please enter a question."

    t = pxt.get_table('chatbot_demo.documents')

    # Insert the question into the table
    t.insert([{'question': msg}])

    # The row just inserted is expected to be the last one in the table.
    answer = t.select(t.gpt4omini).tail(1)['gpt4omini'][0]

    return answer
122
+
123
+ # Gradio interface
124
with gr.Blocks(theme=Monochrome()) as demo:
    # Page header: logo and title.
    gr.Markdown(
        "\n"
        "<div>\n"
        '<img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 20px;" />\n'
        '<h1 style="margin-bottom: 0.5em;">AI Chatbot With Retrieval-Augmented Generation (RAG)</h1>\n'
        "</div>\n"
    )
    # Short product blurb.
    gr.HTML(
        "\n"
        "<p>\n"
        '<a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">Pixeltable</a> is a declarative interface for working with text, images, embeddings, and even video, enabling you to store, transform, index, and iterate on data.\n'
        "</p>\n"
    )

    with gr.Row():
        # Left column: document upload and chunking options.
        with gr.Column():
            pdf_files = gr.File(file_count="multiple", label="Upload PDF Documents")
            chunk_limit = gr.Slider(
                label="Chunk Size Limit (only used when the separator is token_/char_limit)",
                minimum=100,
                maximum=500,
                step=5,
                value=300,
            )
            chunk_separator = gr.Dropdown(
                label="Chunk Separator",
                choices=["token_limit", "char_limit", "sentence", "paragraph", "heading"],
                value="token_limit",
            )
            process_button = gr.Button("Process Files")
            process_output = gr.Textbox(label="Processing Output")

        # Right column: chat history and question input.
        with gr.Column():
            chatbot = gr.Chatbot(label="Chat History")
            msg = gr.Textbox(label="Your Question")
            submit = gr.Button("Submit")

    def respond(message, chat_history):
        """Record the question/answer pair and return an empty input value."""
        chat_history.append((message, get_answer(message)))
        return "", chat_history

    # Event wiring; registration order is kept as-is.
    submit.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    process_button.click(
        process_files,
        inputs=[pdf_files, chunk_limit, chunk_separator],
        outputs=[process_output],
    )

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(debug=True)