pdfChatter

Runtime error

App Files Files Community

king007 committed on Jun 18, 2023

Commit

c6e29e8

•

1 Parent(s): adadda8

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -11

app.py CHANGED Viewed

@@ -17,6 +17,10 @@ def preprocess(text):
     text = re.sub('\s+', ' ', text)
     return text
 def pdf_to_text(path, start_page=1, end_page=None):
     doc = fitz.open(path)
@@ -26,21 +30,33 @@ def pdf_to_text(path, start_page=1, end_page=None):
         end_page = total_pages
     text_list = []
     for i in range(start_page-1, end_page):
         text = doc.load_page(i).get_text("text")
         text = preprocess(text)
         text_list.append(text)
     doc.close()
-    return text_list
 def text_to_chunks(texts, word_length=150, start_page=1):
     text_toks = [t.split(' ') for t in texts]
     page_nums = []
     chunks = []
     for idx, words in enumerate(text_toks):
         for i in range(0, len(words), word_length):
             chunk = words[i:i+word_length]
@@ -51,6 +67,10 @@ def text_to_chunks(texts, word_length=150, start_page=1):
             chunk = ' '.join(chunk).strip()
             chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
             chunks.append(chunk)
     return chunks
@@ -93,10 +113,11 @@ class SemanticSearch:
 def load_recommender(path, start_page=1):
     global recommender
-    texts = pdf_to_text(path, start_page=start_page)
-    chunks = text_to_chunks(texts, start_page=start_page)
     recommender.fit(chunks)
-    return 'Corpus Loaded.'
 def generate_text(openAI_key,prompt, engine="text-davinci-003"):
@@ -142,19 +163,22 @@ def question_answer(url, file, question,openAI_key):
     if url.strip() != '' and file != None:
         return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
     if url.strip() != '':
         glob_url = url
         download_pdf(glob_url, 'corpus.pdf')
-        load_recommender('corpus.pdf')
     else:
         old_file_name = file.name
         file_name = file.name
         file_name = file_name[:-12] + file_name[-4:]
         os.rename(old_file_name, file_name)
-        load_recommender(file_name)
     if question.strip() == '':
         return '[ERROR]: Question field is empty'

     text = re.sub('\s+', ' ', text)
     return text
+def word_count0(str):
+    words = str.split()
+    return len(words)
 def pdf_to_text(path, start_page=1, end_page=None):
     doc = fitz.open(path)
         end_page = total_pages
     text_list = []
+    #
+    text_len = 0
+    #
+    pdf_parse_status = 1
+    #
     for i in range(start_page-1, end_page):
         text = doc.load_page(i).get_text("text")
         text = preprocess(text)
         text_list.append(text)
+        #
+        text_len = text_len + word_count0(text)
     doc.close()
+    if(text_len>10):
+        pdf_parse_status = 0
+        return [], pdf_parse_status
+    return text_list, pdf_parse_status
 def text_to_chunks(texts, word_length=150, start_page=1):
     text_toks = [t.split(' ') for t in texts]
     page_nums = []
     chunks = []
+    #
+    text_len = 0
+    #
+    pdf_parse_status = 1
+    #
     for idx, words in enumerate(text_toks):
         for i in range(0, len(words), word_length):
             chunk = words[i:i+word_length]
             chunk = ' '.join(chunk).strip()
             chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
             chunks.append(chunk)
+            text_len = text_len + word_count0(chunk)
+    if(text_len>10):
+        pdf_parse_status = 0
+        return [], pdf_parse_status
     return chunks
 def load_recommender(path, start_page=1):
     global recommender
+    pdf_parse_status = 1
+    texts, pdf_parse_status = pdf_to_text(path, start_page=start_page)
+    chunks, pdf_parse_status  = text_to_chunks(texts, start_page=start_page)
     recommender.fit(chunks)
+    return 'Corpus Loaded.', pdf_parse_status
 def generate_text(openAI_key,prompt, engine="text-davinci-003"):
     if url.strip() != '' and file != None:
         return '[ERROR]: Both URL and PDF is provided. Please provide only one (eiter URL or PDF).'
+    #
+    pdf_parse_status = 1
     if url.strip() != '':
         glob_url = url
         download_pdf(glob_url, 'corpus.pdf')
+        load_resp, pdf_parse_status = load_recommender('corpus.pdf')
     else:
         old_file_name = file.name
         file_name = file.name
         file_name = file_name[:-12] + file_name[-4:]
         os.rename(old_file_name, file_name)
+        load_resp, pdf_parse_status = load_recommender(file_name)
+    #
+    if pdf_parse_status == 0:
+        return 'CODE:1004, MSG:PDF FILE TOO LARGE'
     if question.strip() == '':
         return '[ERROR]: Question field is empty'