Anupam251272 committed
Commit d3a91f4 · verified · 1 Parent(s): dfb05a7

Create app.py

Files changed (1): app.py (+307, -0)

app.py ADDED
@@ -0,0 +1,307 @@
import PyPDF2
import torch
from transformers import pipeline
import gradio as gr
import logging
import random
from typing import List
import time
import requests
from bs4 import BeautifulSoup
import tempfile
import os
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ContentQuestionGenerator:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Summarization model; device=0 selects the first GPU, -1 runs on CPU
        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=0 if self.device == "cuda" else -1
        )

        # Question-generation model (T5 fine-tuned for SQuAD-style questions)
        self.question_generator = pipeline(
            "text2text-generation",
            model="lmqg/t5-base-squad-qg",
            device=0 if self.device == "cuda" else -1
        )

    def process_large_pdf(self, file_obj, chunk_size=50) -> str:
        """Extract text from a large PDF, processing pages in chunks."""
        temp_file_path = None
        try:
            # Write the upload to a temporary file so PyPDF2 can seek in it
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                if isinstance(file_obj, bytes):
                    temp_file.write(file_obj)
                else:
                    temp_file.write(file_obj.read())
                temp_file_path = temp_file.name

            # Open the PDF with PyPDF2
            with open(temp_file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                total_pages = len(pdf_reader.pages)
                logger.info(f"Processing PDF with {total_pages} pages")

                all_text = []
                # Process pages in chunks
                for i in range(0, total_pages, chunk_size):
                    chunk_text = ""
                    end_page = min(i + chunk_size, total_pages)

                    logger.info(f"Processing pages {i+1} to {end_page}")
                    for page_num in range(i, end_page):
                        try:
                            page = pdf_reader.pages[page_num]
                            chunk_text += page.extract_text() + "\n"
                        except Exception as e:
                            logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
                            continue

                    if chunk_text.strip():
                        all_text.append(chunk_text)

                    # Free up memory
                    del chunk_text

            # Clean up temporary file (after the reader handle is closed)
            os.unlink(temp_file_path)

            return "\n".join(all_text)

        except Exception as e:
            logger.error(f"Error processing large PDF: {str(e)}")
            if temp_file_path is not None:
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass
            raise

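    # process_large_pdf illustrated (hypothetical page count): with the default
    # chunk_size=50, a 230-page PDF is read as five batches of pages 1-50,
    # 51-100, 101-150, 151-200 and 201-230, so at most ~50 pages of extracted
    # text are buffered in memory at a time.
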
    def extract_text_from_url(self, url: str) -> str:
        """Extract readable text content from a webpage."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }

            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove non-content elements
            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                element.decompose()

            # Handle Wikipedia specifically: restrict to the main article body
            if 'wikipedia.org' in url:
                main_content = soup.find('div', {'id': 'mw-content-text'})
                text = ' '.join(p.get_text() for p in (main_content or soup).find_all('p'))
            else:
                text = ' '.join(p.get_text() for p in soup.find_all('p'))

            # Collapse runs of whitespace
            text = ' '.join(text.split())

            if not text.strip():
                raise ValueError("No text content could be extracted from the URL")

            return text.strip()

        except Exception as e:
            logger.error(f"Error extracting text from URL: {str(e)}")
            raise ValueError(f"Could not extract text from URL: {str(e)}")

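    # Wikipedia branch illustrated (hypothetical URL): for a page such as
    # https://en.wikipedia.org/wiki/Alan_Turing, only <p> tags inside the div
    # with id "mw-content-text" are joined, which skips sidebars, navigation
    # boxes and the reference list.
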
    def chunk_text(self, text: str, max_chunk_size: int = 1024) -> List[str]:
        """Split text into sentence-aligned chunks of at most max_chunk_size characters."""
        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in text.split('.'):
            if not sentence.strip():
                continue  # skip empty fragments (e.g. after a trailing period)
            sentence = sentence.strip() + '.'
            if current_size + len(sentence) + 1 <= max_chunk_size:
                current_chunk.append(sentence)
                current_size += len(sentence) + 1
            else:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_size = len(sentence) + 1

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

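    # chunk_text illustrated (hypothetical input): with max_chunk_size=35,
    # "First sentence. Second sentence. Third." is packed greedily into
    # ["First sentence. Second sentence.", "Third."], since appending the
    # third sentence would push the first chunk past the 35-character budget.
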
    def summarize_text(self, text: str) -> str:
        """Summarize text with memory-efficient chunking."""
        # 1024 characters per chunk is a conservative heuristic for BART's
        # 1024-token input limit
        chunks = self.chunk_text(text)
        summaries = []

        for chunk in tqdm(chunks, desc="Summarizing text"):
            if len(chunk.strip()) > 50:
                try:
                    summary = self.summarizer(
                        chunk,
                        max_length=150,
                        min_length=40,
                        do_sample=False
                    )[0]['summary_text']
                    summaries.append(summary)
                except Exception as e:
                    logger.warning(f"Error summarizing chunk: {str(e)}")
                    continue

            # Free up memory between chunks
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return " ".join(summaries)

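    # Rough sizing (hypothetical numbers): a 10,000-character extraction yields
    # about ten 1,024-character chunks; each is condensed to between
    # min_length=40 and max_length=150 tokens, so the combined summary stays
    # short enough for the sentence-level question generation below.
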
    def generate_questions(self, text: str, num_questions: int = 20) -> List[str]:
        """Generate diverse questions with memory management."""
        try:
            all_questions = set()  # use a set to ensure uniqueness
            sentences = text.split('.')

            for sentence in tqdm(sentences, desc="Generating questions"):
                # Collect about twice as many candidates as requested,
                # then sample from them below
                if len(all_questions) >= num_questions * 2:
                    break

                if len(sentence.strip()) > 30:
                    try:
                        generated = self.question_generator(
                            sentence.strip(),
                            max_length=64,
                            num_return_sequences=2,
                            do_sample=True,
                            temperature=0.8
                        )

                        for gen in generated:
                            question = gen['generated_text'].strip()
                            # Keep only well-formed questions longer than three words
                            if question.endswith('?') and len(question.split()) > 3:
                                all_questions.add(question)

                        # Free up memory
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()

                    except Exception as e:
                        logger.warning(f"Error generating question: {str(e)}")
                        continue

            # Convert to list and randomize
            questions_list = list(all_questions)
            random.shuffle(questions_list)

            return questions_list[:num_questions]

        except Exception as e:
            logger.error(f"Error generating questions: {str(e)}")
            raise

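    # Filter illustrated (hypothetical model outputs): a candidate such as
    # "What did the author propose in 1950?" is kept (ends with "?", seven
    # words), while "What is it?" is dropped because it has only three words.
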
    def process_input(self, input_data) -> str:
        """Process either PDF file or URL with progress tracking."""
        try:
            start_time = time.time()

            # Extract text based on input type
            if isinstance(input_data, str) and (input_data.startswith('http://') or input_data.startswith('https://')):
                logger.info("Processing URL content...")
                text = self.extract_text_from_url(input_data)
            else:
                logger.info("Processing PDF content...")
                text = self.process_large_pdf(input_data)

            logger.info(f"Extracted {len(text)} characters of text")

            # Process in chunks with memory management
            logger.info("Summarizing content...")
            summarized_text = self.summarize_text(text)
            logger.info(f"Summarized to {len(summarized_text)} characters")

            logger.info("Generating questions...")
            questions = self.generate_questions(summarized_text)
            logger.info(f"Generated {len(questions)} questions")

            if not questions:
                return "Could not generate any valid questions from the content."

            formatted_output = "\n".join(f"{i+1}. {q}" for i, q in enumerate(questions))
            processing_time = time.time() - start_time
            logger.info(f"Total processing time: {processing_time:.2f} seconds")

            return formatted_output

        except Exception as e:
            error_msg = f"Error processing input: {str(e)}"
            logger.error(error_msg)
            return f"An error occurred: {error_msg}"

def create_gradio_interface():
    """Create and configure the Gradio interface."""
    generator = ContentQuestionGenerator()

    def process_input(file, url):
        if file is None and not url:
            return "Please provide either a PDF file or a webpage URL."
        if file is not None and url:
            return "Please provide either a PDF file or a URL, not both."

        try:
            if url:
                if not (url.startswith('http://') or url.startswith('https://')):
                    return "Please provide a valid URL starting with http:// or https://"
                return generator.process_input(url)

            return generator.process_input(file)

        except Exception as e:
            logger.error("Error processing input:", exc_info=True)
            return f"Error processing input: {str(e)}"

    interface = gr.Interface(
        fn=process_input,
        inputs=[
            gr.File(
                label="Upload PDF Document",
                type="binary",
                file_types=[".pdf"],
                file_count="single"
            ),
            gr.Textbox(
                label="Or enter webpage URL",
                placeholder="https://example.com/page or https://en.wikipedia.org/wiki/Topic"
            )
        ],
        outputs=gr.Textbox(
            label="Generated Questions",
            lines=20
        ),
        title="Content Question Generator",
        description="""
        Upload a PDF document of any size or provide a webpage URL to generate relevant questions.

        Features:
        - Supports large PDF files (100MB+)
        - Works with any webpage URL
        - Special handling for Wikipedia pages
        - Generates 20 unique, randomly ordered questions
        - Shows progress during processing

        Note: Large files may take several minutes to process.
        """,
        allow_flagging="never"
    )

    return interface

if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.queue().launch(share=True)
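
For quick checks outside the Gradio UI, here is a minimal usage sketch driving the class directly (hypothetical: it assumes this file is importable as app.py and that a local sample.pdf exists):

# run_example.py (hypothetical helper script)
from app import ContentQuestionGenerator

generator = ContentQuestionGenerator()

# A string starting with http:// or https:// is routed to extract_text_from_url()
print(generator.process_input("https://en.wikipedia.org/wiki/Machine_learning"))

# Raw PDF bytes are routed to process_large_pdf()
with open("sample.pdf", "rb") as f:
    print(generator.process_input(f.read()))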