oliverwang15 committed on
Commit
acc114a
·
1 Parent(s): 4831f4c

Update: add support for the .pdf format

Browse files
Files changed (4) hide show
  1. app.py +1 -1
  2. backend.py +19 -3
  3. openai.py +0 -1
  4. requirements.txt +3 -1
app.py CHANGED
@@ -31,7 +31,7 @@ with gr.Blocks(theme="dark") as demo:
31
  label='Enter your OpenAI API key here',
32
  type='password')
33
 
34
- file = gr.File(label='Upload your .txt file here', file_types=['.txt'], file_count = 'multiple')
35
 
36
  questions = gr.CheckboxGroup(choices = QUESTIONS, value = QUESTIONS, label="Questions", info="Please select the question you want to ask")
37
 
 
31
  label='Enter your OpenAI API key here',
32
  type='password')
33
 
34
+ file = gr.File(label='Upload your .txt or .pdf file here', file_types=['.txt', '.pdf'], file_count = 'multiple')
35
 
36
  questions = gr.CheckboxGroup(choices = QUESTIONS, value = QUESTIONS, label="Questions", info="Please select the question you want to ask")
37
 
backend.py CHANGED
@@ -21,15 +21,28 @@ class Backend:
21
  raise gr.Error("You need to upload a file first")
22
  return text
23
 
 
 
 
 
 
 
24
  def read_file(self, files):
25
  # read the file
26
  text_list = []
27
  self.filename_list = []
28
  if files is not None:
29
  for file in files:
30
- with open(file.name, 'r', encoding='utf-8') as f:
31
- text_list.append(f.read())
32
- self.filename_list.append(file.name.split('\\')[-1])
 
 
 
 
 
 
 
33
  else:
34
  raise gr.Error("You need to upload a file first")
35
  return text_list
@@ -205,11 +218,13 @@ class Backend:
205
  del self.clicked_correct_reference
206
 
207
  self.current_passage += 1
 
208
 
209
  if self.current_passage >= self.total_passages:
210
  # self.current_passage -= 1
211
  return "No more passages!", "No more passages!", "No more passages!", "No more passages!", 'No more passages!', 'No more passages!', 'Still need to click the button above to save the results', None, None
212
  else:
 
213
  gpt_res = self.res_list[self.current_passage]
214
  self.gpt_result = gpt_res
215
  res = self.gpt_result[f'Question {self.current_question + 1}']
@@ -236,6 +251,7 @@ class Backend:
236
  # self.current_passage += 1
237
  return "No more passages!", "No more passages!", "No more passages!", "No more passages!", 'No more passages!', 'No more passages!', 'Still need to click the button above to save the results', None, None
238
  else:
 
239
  gpt_res = self.res_list[self.current_passage]
240
  self.gpt_result = gpt_res
241
  res = self.gpt_result[f'Question {self.current_question + 1}']
 
21
  raise gr.Error("You need to upload a file first")
22
  return text
23
 
24
+ def phrase_pdf(self, file_path):
25
+ from langchain.document_loaders import UnstructuredPDFLoader
26
+ loader = UnstructuredPDFLoader(file_path, model = 'elements')
27
+ file = loader.load()
28
+ return file[0].page_content
29
+
30
  def read_file(self, files):
31
  # read the file
32
  text_list = []
33
  self.filename_list = []
34
  if files is not None:
35
  for file in files:
36
+ if file.name.split('.')[-1] == 'pdf':
37
+ # convert pdf to txt
38
+ text = self.phrase_pdf(file.name)
39
+
40
+ else:
41
+ with open(file.name, 'r', encoding='utf-8') as f:
42
+ text = f.read()
43
+
44
+ text_list.append(text)
45
+ self.filename_list.append(file.name.split('\\')[-1])
46
  else:
47
  raise gr.Error("You need to upload a file first")
48
  return text_list
 
218
  del self.clicked_correct_reference
219
 
220
  self.current_passage += 1
221
+
222
 
223
  if self.current_passage >= self.total_passages:
224
  # self.current_passage -= 1
225
  return "No more passages!", "No more passages!", "No more passages!", "No more passages!", 'No more passages!', 'No more passages!', 'Still need to click the button above to save the results', None, None
226
  else:
227
+ self.text = self.text_list[self.current_passage]
228
  gpt_res = self.res_list[self.current_passage]
229
  self.gpt_result = gpt_res
230
  res = self.gpt_result[f'Question {self.current_question + 1}']
 
251
  # self.current_passage += 1
252
  return "No more passages!", "No more passages!", "No more passages!", "No more passages!", 'No more passages!', 'No more passages!', 'Still need to click the button above to save the results', None, None
253
  else:
254
+ self.text = self.text_list[self.current_passage]
255
  gpt_res = self.res_list[self.current_passage]
256
  self.gpt_result = gpt_res
257
  res = self.gpt_result[f'Question {self.current_question + 1}']
openai.py CHANGED
@@ -32,7 +32,6 @@ class OpenAI:
32
  }, headers={
33
  'Authorization': f"Bearer {api_key}"
34
  })
35
- print(resp)
36
  self.history.append(resp.json()['choices'][0]['message'])
37
  res = resp.json()['choices'][0]['message']['content']
38
 
 
32
  }, headers={
33
  'Authorization': f"Bearer {api_key}"
34
  })
 
35
  self.history.append(resp.json()['choices'][0]['message'])
36
  res = resp.json()['choices'][0]['message']['content']
37
 
requirements.txt CHANGED
@@ -1,2 +1,4 @@
1
  fuzzywuzzy
2
- openpyxl
 
 
 
1
  fuzzywuzzy
2
+ openpyxl
3
+ "unstructured[all-docs]"
4
+ langchain