FreestylerAI committed
Commit 9d0ffca · verified · 1 Parent(s): fbf0ed4

Update app.py

Files changed (1)
  1. app.py +736 -813
app.py CHANGED
@@ -1,813 +1,736 @@
1
- import os
2
- import json
3
- import pandas as pd
4
- import gradio as gr
5
- import spaces
6
- from transformers import AutoModelForCausalLM, AutoTokenizer
7
- import torch
8
- import csv
9
- import yaml
10
- from typing import List, Dict, Any
11
- import random
12
- from pypdf import PdfReader
13
- import re
14
- import tempfile
15
- from huggingface_hub import HfApi
16
-
17
- # Configuration
18
- DEFAULT_MODEL = "databricks/dolly-v2-3b" # Smaller, more suitable for Spaces
19
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Try to use CUDA if available
20
- MAX_NEW_TOKENS = 512
21
- TEMPERATURE = 0.7
22
- HF_TOKEN = os.environ.get("HF_TOKEN") if os.environ.get("HF_TOKEN") else None # Get token from environment variables
23
- MAX_RAM_GB = 45 # Set maximum RAM usage to 45GB (below the 70GB limit)
24
-
25
- # Create offload folder for model memory management
26
- os.makedirs("offload_folder", exist_ok=True)
27
-
28
- # Setup RAM monitoring
29
- def get_process_memory_usage():
30
- """Get the current memory usage of this process in GB"""
31
- import psutil
32
- process = psutil.Process(os.getpid())
33
- return process.memory_info().rss / (1024 * 1024 * 1024) # Convert to GB
34
-
35
- class PdfExtractor:
36
- """Extract text content from PDF files"""
37
-
38
- @staticmethod
39
- def extract_text_from_pdf(pdf_file):
40
- """Extract text from a PDF file"""
41
- try:
42
- reader = PdfReader(pdf_file)
43
- text = ""
44
-
45
- for page in reader.pages:
46
- text += page.extract_text() + "\n"
47
-
48
- return text
49
- except Exception as e:
50
- print(f"Error extracting text from PDF: {e}")
51
- return None
52
-
53
- @staticmethod
54
- def clean_text(text):
55
- """Clean and preprocess extracted text"""
56
- if not text:
57
- return ""
58
-
59
- # Replace multiple newlines with single newline
60
- text = re.sub(r'\n+', '\n', text)
61
-
62
- # Replace multiple spaces with single space
63
- text = re.sub(r'\s+', ' ', text)
64
-
65
- return text.strip()
66
-
67
- @staticmethod
68
- def chunk_text(text, max_chunk_size=1000, overlap=100):
69
- """Split text into chunks of specified size with overlap"""
70
- if not text:
71
- return []
72
-
73
- chunks = []
74
- start = 0
75
- text_length = len(text)
76
-
77
- while start < text_length:
78
- end = min(start + max_chunk_size, text_length)
79
-
80
- # If we're not at the end, try to break at a sentence or paragraph
81
- if end < text_length:
82
- # Look for sentence breaks (period, question mark, exclamation mark followed by space)
83
- sentence_break = max(
84
- text.rfind('. ', start, end),
85
- text.rfind('? ', start, end),
86
- text.rfind('! ', start, end),
87
- text.rfind('\n', start, end)
88
- )
89
-
90
- if sentence_break > start + max_chunk_size // 2:
91
- end = sentence_break + 1
92
-
93
- chunks.append(text[start:end].strip())
94
- start = end - overlap # Create overlap with previous chunk
95
-
96
- return chunks
97
-
98
- class SyntheticDataGenerator:
99
- def __init__(self, model_name=DEFAULT_MODEL):
100
- self.model_name = model_name
101
- self.model = None
102
- self.tokenizer = None
103
- self.fallback_models = [
104
- "databricks/dolly-v2-3b", # Smallest, most reliable model as primary fallback
105
- "EleutherAI/gpt-neo-1.3B", # Second fallback option
106
- ]
107
- # Don't try to load the model in init - we'll load it when needed
108
- # This prevents initialization errors from blocking the app startup
109
-
110
- def load_model(self):
111
- """Load the specified model or fall back to a smaller model if loading fails"""
112
- # Clear CUDA cache if using GPU to prevent memory fragmentation
113
- if torch.cuda.is_available():
114
- torch.cuda.empty_cache()
115
- # Try initializing CUDA explicitly to catch early errors
116
- try:
117
- torch.tensor([1.0], device="cuda")
118
- except Exception as e:
119
- print(f"CUDA initialization error: {e}")
120
-
121
- models_to_try = [self.model_name]
122
-
123
- # Add fallback models only if the requested model isn't already in the fallback list
124
- if self.model_name not in self.fallback_models:
125
- models_to_try.extend(self.fallback_models)
126
-
127
- for model_name in models_to_try:
128
- try:
129
- print(f"Loading model {model_name} on {DEVICE}...")
130
-
131
- # Add token for authentication if available
132
- tokenizer_kwargs = {}
133
- model_kwargs = {
134
- "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
135
- "device_map": "auto" if torch.cuda.is_available() else None,
136
- "low_cpu_mem_usage": True, # Added to reduce memory usage on CPU
137
- "offload_folder": "offload_folder" # Add offload folder for large models
138
- }
139
-
140
- # Handle potential CUDA out-of-memory errors with smaller settings
141
- if torch.cuda.is_available():
142
- try:
143
- # Check available GPU memory
144
- free_memory, total_memory = torch.cuda.mem_get_info()
145
- free_memory_gb = free_memory / (1024**3)
146
- total_memory_gb = total_memory / (1024**3)
147
- print(f"GPU memory: {free_memory_gb:.2f}GB free out of {total_memory_gb:.2f}GB total")
148
-
149
- # If we're running low on memory (this is common in Spaces), use more aggressive memory saving
150
- model_kwargs["max_memory"] = {0: f"{max(free_memory_gb*0.8, 0.5)}GB", "cpu": "8GB"}
151
-
152
- # For smaller GPUs or when memory is constrained, use more aggressive offloading
153
- if free_memory_gb < 4.0: # Less than 4GB free
154
- print("Low GPU memory detected. Using CPU offloading...")
155
- # More conservative memory map to prevent ZeroGPU errors
156
- model_kwargs["device_map"] = "auto"
157
- model_kwargs["offload_state_dict"] = True # More aggressive offloading
158
- except Exception as memory_check_error:
159
- print(f"Error checking GPU memory: {memory_check_error}")
160
- # Continue with default settings but with safeguards
161
- model_kwargs["device_map"] = "auto" # Let the library decide the best mapping
162
-
163
- # Add token for authentication if available and model is gated
164
- if HF_TOKEN:
165
- tokenizer_kwargs["token"] = HF_TOKEN
166
- model_kwargs["token"] = HF_TOKEN
167
- print("Using Hugging Face token for authentication")
168
-
169
- # Load tokenizer with safeguards against ZeroGPU issues
170
- try:
171
- self.tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)
172
- except Exception as tokenizer_error:
173
- print(f"Error loading tokenizer: {tokenizer_error}")
174
- # Try loading with additional safety settings
175
- tokenizer_kwargs["local_files_only"] = False
176
- tokenizer_kwargs["revision"] = "main"
177
- self.tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)
178
-
179
- # Load the model with ZeroGPU error prevention
180
- try:
181
- self.model = AutoModelForCausalLM.from_pretrained(
182
- model_name,
183
- **model_kwargs
184
- )
185
- except RuntimeError as e:
186
- if "CUDA" in str(e) or "GPU" in str(e) or "out of memory" in str(e):
187
- print(f"CUDA error loading model: {e}")
188
- # Fall back to CPU if GPU fails
189
- print("Falling back to CPU for model loading")
190
- model_kwargs["device_map"] = {"": "cpu"}
191
- model_kwargs["torch_dtype"] = torch.float32
192
- self.model = AutoModelForCausalLM.from_pretrained(
193
- model_name,
194
- **model_kwargs
195
- )
196
- else:
197
- raise
198
-
199
- # Ensure model is on the right device if not using device_map="auto"
200
- if not torch.cuda.is_available():
201
- self.model = self.model.to(DEVICE)
202
-
203
- # If we loaded a fallback model instead of the requested one, update the model_name
204
- if model_name != self.model_name:
205
- print(f"Successfully loaded fallback model {model_name} instead of {self.model_name}")
206
- self.model_name = model_name
207
- else:
208
- print(f"Model loaded successfully on {DEVICE}")
209
-
210
- return True
211
- except Exception as e:
212
- print(f"Error loading model {model_name}: {e}")
213
- self.model = None
214
- self.tokenizer = None
215
- # Continue to the next model in the list
216
-
217
- # If we get here, all models failed
218
- print("All models failed to load")
219
- return False
220
-
221
- def generate_qa_prompt(self, context, num_questions=3, include_tags=True, difficulty_levels=True):
222
- """Generate a prompt for creating Q&A pairs from context."""
223
- tag_instruction = ""
224
- if include_tags:
225
- tag_instruction = "Add 1-3 tags for each question that categorize the topic or subject matter."
226
-
227
- difficulty_instruction = ""
228
- if difficulty_levels:
229
- difficulty_instruction = "For each question, assign a difficulty level (easy, medium, or hard)."
230
-
231
- prompt = f"""Task: Based on the following text, generate {num_questions} question and answer pairs that would be useful for comprehension testing or knowledge assessment.
232
-
233
- CONTEXT:
234
- {context}
235
-
236
- For each question:
237
- 1. Write a clear, specific question about the information in the text
238
- 2. Provide the correct answer to the question, citing relevant details from the text
239
- 3. {tag_instruction}
240
- 4. {difficulty_instruction}
241
-
242
- Format each Q&A pair as a JSON object with the following structure:
243
- {{
244
- "question": "The question text",
245
- "answer": "The answer text",
246
- "tags": ["tag1", "tag2"],
247
- "difficulty": "easy/medium/hard"
248
- }}
249
-
250
- Return all Q&A pairs in a JSON array.
251
- """
252
- return prompt
253
-
254
- def generate_data(self, prompt, num_samples=1):
255
- """Generate synthetic data using the loaded model."""
256
- if not self.model or not self.tokenizer:
257
- return ["Error: Model not loaded properly. Please try again with a different model."]
258
-
259
- outputs = []
260
- for sample_idx in range(num_samples):
261
- try:
262
- # Clear CUDA cache before generating to free up memory
263
- if torch.cuda.is_available():
264
- torch.cuda.empty_cache()
265
-
266
- # ZeroGPU errors often occur in generate() calls
267
- # To mitigate this, try multiple approaches in sequence
268
- inputs = self.tokenizer(prompt, return_tensors="pt").to(DEVICE)
269
-
270
- try:
271
- # First try: Standard generation with conservative settings
272
- with torch.no_grad():
273
- output = self.model.generate(
274
- **inputs,
275
- max_new_tokens=MAX_NEW_TOKENS,
276
- temperature=TEMPERATURE,
277
- do_sample=True,
278
- pad_token_id=self.tokenizer.eos_token_id,
279
- num_beams=1, # Use greedy decoding instead of beam search
280
- early_stopping=True,
281
- no_repeat_ngram_size=3 # Prevent repetition
282
- )
283
-
284
- decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
285
- except (RuntimeError, Exception) as e:
286
- if "CUDA" in str(e) or "GPU" in str(e) or "ZeroGPU" in str(e):
287
- print(f"GPU error during generation: {e}")
288
- print("Falling back to CPU generation...")
289
-
290
- # Move everything to CPU
291
- inputs = {k: v.to('cpu') for k, v in inputs.items()}
292
-
293
- # Create CPU copy of the model if we were using GPU
294
- if torch.cuda.is_available():
295
- # Temporarily move model to CPU for this generation
296
- model_cpu = self.model.to('cpu')
297
-
298
- with torch.no_grad():
299
- output = model_cpu.generate(
300
- **inputs,
301
- max_new_tokens=MAX_NEW_TOKENS,
302
- temperature=TEMPERATURE,
303
- do_sample=True,
304
- pad_token_id=self.tokenizer.eos_token_id,
305
- num_return_sequences=1,
306
- max_length=MAX_NEW_TOKENS + inputs['input_ids'].shape[1]
307
- )
308
- decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
309
-
310
- # Move model back to CUDA for future calls
311
- self.model = self.model.to(DEVICE)
312
- else:
313
- # Already on CPU, try with reduced parameters
314
- with torch.no_grad():
315
- output = self.model.generate(
316
- **inputs,
317
- max_new_tokens=min(256, MAX_NEW_TOKENS), # Reduce token count
318
- temperature=0.5, # Lower temperature
319
- do_sample=False, # No sampling
320
- num_return_sequences=1,
321
- pad_token_id=self.tokenizer.eos_token_id
322
- )
323
- decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
324
- else:
325
- # Re-raise non-CUDA errors
326
- raise
327
-
328
- # Extract only the generated part (remove prompt)
329
- prompt_text = self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
330
- generated_text = decoded_output[len(prompt_text):].strip()
331
- outputs.append(generated_text)
332
-
333
- # Clear CUDA cache between samples
334
- if torch.cuda.is_available():
335
- torch.cuda.empty_cache()
336
-
337
- except Exception as e:
338
- error_msg = f"Error generating sample {sample_idx+1}: {str(e)}"
339
- print(error_msg)
340
- outputs.append(f"Error: {error_msg}")
341
-
342
- return outputs
343
-
344
- def parse_json_data(self, generated_text):
345
- """Extract and parse JSON from generated text."""
346
- try:
347
- # Find JSON-like content (between [ and ])
348
- start_idx = generated_text.find('[')
349
- end_idx = generated_text.rfind(']') + 1
350
-
351
- if start_idx >= 0 and end_idx > start_idx:
352
- json_str = generated_text[start_idx:end_idx]
353
- return json.loads(json_str)
354
-
355
- # Try to find single object format
356
- start_idx = generated_text.find('{')
357
- end_idx = generated_text.rfind('}') + 1
358
-
359
- if start_idx >= 0 and end_idx > start_idx:
360
- json_str = generated_text[start_idx:end_idx]
361
- return json.loads(json_str)
362
-
363
- print(f"Could not find JSON content in: {generated_text}")
364
- return None
365
- except json.JSONDecodeError as e:
366
- print(f"JSON parse error: {e}")
367
- print(f"Problematic text: {generated_text}")
368
-
369
- # Try to find and fix common JSON formatting errors
370
- try:
371
- # Replace single quotes with double quotes
372
- json_str = generated_text[start_idx:end_idx].replace("'", "\"")
373
- return json.loads(json_str)
374
- except:
375
- pass
376
-
377
- # If still failing, try to extract individual JSON objects
378
- try:
379
- pattern = r'\{[^{}]*\}'
380
- matches = re.findall(pattern, generated_text)
381
- if matches:
382
- results = []
383
- for match in matches:
384
- try:
385
- # Replace single quotes with double quotes
386
- fixed_match = match.replace("'", "\"")
387
- obj = json.loads(fixed_match)
388
- results.append(obj)
389
- except:
390
- continue
391
- if results:
392
- return results
393
- except:
394
- pass
395
-
396
- return None
397
-
398
- def generate_qa_from_pdf_chunk(self, chunk, num_questions=3, include_tags=True, difficulty_levels=True):
399
- """Generate Q&A pairs from a PDF text chunk."""
400
- if not self.model or not self.tokenizer:
401
- return [], "Error: Model not loaded properly. Please try again with a different model."
402
-
403
- if not chunk or len(chunk.strip()) < 100: # Skip very small chunks
404
- return [], "Chunk too small to generate meaningful Q&A pairs."
405
-
406
- prompt = self.generate_qa_prompt(chunk, num_questions, include_tags, difficulty_levels)
407
- raw_outputs = self.generate_data(prompt, num_samples=1)
408
- raw_output = raw_outputs[0]
409
-
410
- parsed_data = self.parse_json_data(raw_output)
411
-
412
- # Ensure parsed data is a list
413
- if parsed_data and isinstance(parsed_data, dict):
414
- parsed_data = [parsed_data]
415
-
416
- # Return both the parsed data and raw output for debugging
417
- return parsed_data, raw_output
418
-
419
- def format_data_preview(data):
420
- """Format the data for preview in the UI."""
421
- if isinstance(data, list):
422
- if len(data) > 0 and isinstance(data[0], dict):
423
- # Convert list of dicts to DataFrame for better display
424
- return pd.DataFrame(data).to_string()
425
- else:
426
- return json.dumps(data, indent=2)
427
- elif isinstance(data, dict):
428
- return json.dumps(data, indent=2)
429
- else:
430
- return str(data)
431
-
432
- def save_data(data, format, filename_prefix):
433
- """Save data to a file in the specified format."""
434
- os.makedirs("synthetic_data", exist_ok=True)
435
- timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
436
- filename = f"synthetic_data/{filename_prefix}_{timestamp}"
437
-
438
- if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
439
- df = pd.DataFrame(data)
440
-
441
- if format.lower() == "csv":
442
- full_filename = f"{filename}.csv"
443
- df.to_csv(full_filename, index=False)
444
- elif format.lower() == "json":
445
- full_filename = f"{filename}.json"
446
- with open(full_filename, "w") as f:
447
- json.dump(data, f, indent=2)
448
- elif format.lower() == "excel":
449
- full_filename = f"{filename}.xlsx"
450
- df.to_excel(full_filename, index=False)
451
- else:
452
- full_filename = f"{filename}.txt"
453
- with open(full_filename, "w") as f:
454
- f.write(str(data))
455
- else:
456
- full_filename = f"{filename}.{format.lower()}"
457
- with open(full_filename, "w") as f:
458
- if format.lower() == "json":
459
- json.dump(data, f, indent=2)
460
- else:
461
- f.write(str(data))
462
-
463
- return full_filename
464
-
465
- def load_models():
466
- """Return a list of available models."""
467
- return [
468
- "databricks/dolly-v2-3b",
469
- "databricks/dolly-v2-7b",
470
- "EleutherAI/gpt-neo-1.3B",
471
- "EleutherAI/gpt-neo-2.7B",
472
- "tiiuae/falcon-7b-instruct"
473
- ]
474
-
475
- @spaces.GPU
476
- def process_pdf_generate_qa(pdf_file, model_name, num_questions_per_chunk, include_tags, include_difficulty, output_file_format, progress=None):
477
- """Process a PDF file and generate Q&A pairs from its content."""
478
- if pdf_file is None:
479
- return None, "Error: No PDF file uploaded", "", "No file provided"
480
-
481
- try:
482
- # Check RAM usage at start
483
- current_ram_usage = get_process_memory_usage()
484
- print(f"Starting RAM usage: {current_ram_usage:.2f}GB")
485
-
486
- # Clear CUDA cache before starting
487
- if torch.cuda.is_available():
488
- torch.cuda.empty_cache()
489
-
490
- # Initialize extractor and generator
491
- extractor = PdfExtractor()
492
- generator = SyntheticDataGenerator(model_name)
493
-
494
- # Wrap model loading in try-except to handle errors
495
- try:
496
- load_success = generator.load_model()
497
- if not load_success:
498
- return None, "Error: Failed to load the model. Please try again with a different model.", "", "Model loading failed"
499
- except Exception as e:
500
- if "ZeroGPU" in str(e) or "GPU task aborted" in str(e) or "CUDA" in str(e):
501
- print(f"GPU error during model loading: {e}. Trying with a smaller model...")
502
- # If we get a ZeroGPU error, immediately try the smallest model
503
- generator.model_name = "EleutherAI/gpt-neo-1.3B" # Use smallest model as emergency fallback
504
- load_success = generator.load_model()
505
- if not load_success:
506
- return None, "Error: Failed to load any model even after fallback. Please try again later.", "", "Model loading failed"
507
- else:
508
- # Re-raise other errors
509
- raise
510
-
511
- # Check RAM usage after model loading
512
- ram_after_model = get_process_memory_usage()
513
- print(f"RAM usage after model loading: {ram_after_model:.2f}GB")
514
-
515
- # Save PDF temporarily if it's a file object
516
- if hasattr(pdf_file, 'name'):
517
- # It's already a file path
518
- pdf_path = pdf_file.name
519
- else:
520
- # Create a temporary file
521
- with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
522
- tmp.write(pdf_file)
523
- pdf_path = tmp.name
524
-
525
- # Extract text from PDF
526
- pdf_text = extractor.extract_text_from_pdf(pdf_path)
527
-
528
- if not pdf_text:
529
- return None, "Failed to extract text from PDF", "", "No data generated"
530
-
531
- # Clean and chunk the text - reduce chunk size to use less memory
532
- cleaned_text = extractor.clean_text(pdf_text)
533
- chunks = extractor.chunk_text(cleaned_text, max_chunk_size=400, overlap=30)
534
-
535
- # Check RAM after PDF processing
536
- ram_after_pdf = get_process_memory_usage()
537
- print(f"RAM usage after PDF processing: {ram_after_pdf:.2f}GB, found {len(chunks)} chunks")
538
-
539
- # If we're approaching the RAM limit already, reduce batch size
540
- batch_size = 3 # Default
541
- if ram_after_pdf > MAX_RAM_GB * 0.7: # If already using 70% of our limit
542
- batch_size = 1 # Process one chunk at a time
543
- print(f"High RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 1")
544
- elif ram_after_pdf > MAX_RAM_GB * 0.5: # If using 50% of our limit
545
- batch_size = 2 # Process two chunks at a time
546
- print(f"Moderate RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 2")
547
-
548
- # Generate Q&A pairs for each chunk
549
- all_qa_pairs = []
550
- all_raw_outputs = []
551
-
552
- total_chunks = len(chunks)
553
-
554
- # Process chunks in smaller batches to avoid memory buildup
555
- for i in range(0, total_chunks, batch_size):
556
- # Get the current batch of chunks
557
- batch_chunks = chunks[i:min(i+batch_size, total_chunks)]
558
-
559
- # Process each chunk in the batch
560
- for j, chunk in enumerate(batch_chunks):
561
- chunk_index = i + j
562
-
563
- if progress is not None:
564
- progress(chunk_index / total_chunks, f"Processing chunk {chunk_index+1}/{total_chunks}")
565
-
566
- # Check if we're approaching RAM limit
567
- current_ram = get_process_memory_usage()
568
- if current_ram > MAX_RAM_GB * 0.9: # Over 90% of our limit
569
- print(f"WARNING: High RAM usage detected: {current_ram:.2f}GB - force releasing memory")
570
- import gc
571
- gc.collect() # Force garbage collection
572
- if torch.cuda.is_available():
573
- torch.cuda.empty_cache()
574
-
575
- # If still too high after garbage collection, abort batch processing
576
- current_ram = get_process_memory_usage()
577
- if current_ram > MAX_RAM_GB * 0.95: # Still dangerously high
578
- print(f"CRITICAL: RAM usage too high ({current_ram:.2f}GB), stopping processing")
579
- break
580
-
581
- # Clear CUDA cache between chunks
582
- if torch.cuda.is_available():
583
- torch.cuda.empty_cache()
584
-
585
- try:
586
- qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(
587
- chunk,
588
- num_questions=num_questions_per_chunk,
589
- include_tags=include_tags,
590
- difficulty_levels=include_difficulty
591
- )
592
- except Exception as e:
593
- error_type = str(e)
594
- if "CUDA" in error_type or "GPU" in error_type or "ZeroGPU" in error_type:
595
- print(f"GPU error during generation for chunk {chunk_index+1}: {e}")
596
- # Fall back to CPU for this specific generation
597
- raw_output = f"Error in chunk {chunk_index+1}: {str(e)}. Skipping..."
598
- qa_pairs = None
599
- elif "memory" in error_type.lower() or "ram" in error_type.lower():
600
- print(f"Memory error processing chunk {chunk_index+1}: {e}")
601
- # Force garbage collection and skip chunk
602
- import gc
603
- gc.collect()
604
- if torch.cuda.is_available():
605
- torch.cuda.empty_cache()
606
- raw_output = f"Memory error in chunk {chunk_index+1}: {str(e)}. Skipping..."
607
- qa_pairs = None
608
- else:
609
- # For other errors, just log and continue
610
- print(f"Error processing chunk {chunk_index+1}: {e}")
611
- raw_output = f"Error in chunk {chunk_index+1}: {str(e)}"
612
- qa_pairs = None
613
-
614
- if qa_pairs:
615
- all_qa_pairs.extend(qa_pairs)
616
- all_raw_outputs.append(raw_output)
617
-
618
- # Check RAM usage after processing this chunk
619
- current_ram = get_process_memory_usage()
620
- print(f"RAM after chunk {chunk_index+1}: {current_ram:.2f}GB")
621
-
622
- # Do a thorough cleanup after each batch
623
- if torch.cuda.is_available():
624
- torch.cuda.empty_cache()
625
-
626
- # Force garbage collection between batches
627
- import gc
628
- gc.collect()
629
-
630
- # Check if we need to abort due to memory constraints
631
- current_ram = get_process_memory_usage()
632
- if current_ram > MAX_RAM_GB:
633
- print(f"WARNING: Exceeding RAM limit ({current_ram:.2f}GB). Stopping further processing.")
634
- if progress is not None:
635
- progress(1.0, f"Stopped early due to high memory usage ({current_ram:.2f}GB)")
636
- break
637
-
638
- if progress is not None:
639
- progress(1.0, "Finished processing")
640
-
641
- # Final cache clear and garbage collection
642
- if torch.cuda.is_available():
643
- torch.cuda.empty_cache()
644
- import gc
645
- gc.collect()
646
-
647
- if not all_qa_pairs:
648
- return None, "Failed to generate Q&A pairs", "\n\n".join(all_raw_outputs), "No data generated"
649
-
650
- # Save data to file
651
- filename = save_data(
652
- all_qa_pairs,
653
- output_file_format,
654
- "qa_dataset"
655
- )
656
-
657
- # Format for display
658
- formatted_data = format_data_preview(all_qa_pairs)
659
-
660
- # Final memory report
661
- final_ram = get_process_memory_usage()
662
- print(f"Final RAM usage: {final_ram:.2f}GB")
663
-
664
- return all_qa_pairs, formatted_data, "\n\n".join(all_raw_outputs), f"Data saved to {filename}"
665
- except Exception as e:
666
- error_msg = f"Error processing PDF: {str(e)}"
667
- print(error_msg)
668
- import traceback
669
- print(traceback.format_exc())
670
- return None, error_msg, "", "Processing failed"
671
-
672
- # Set up the Gradio interface
673
- def create_interface():
674
- with gr.Blocks(title="PDF Q&A Dataset Generator") as app:
675
- gr.Markdown("# 📚 PDF Q&A Dataset Generator")
676
- gr.Markdown("""
677
- Generate question & answer datasets from PDF documents using instruction-tuned language models.
678
- Perfect for creating educational resources, quiz materials, or training data for Q&A systems.
679
- """)
680
-
681
- with gr.Tabs() as tabs:
682
- with gr.TabItem("Generate Q&A Dataset"):
683
- with gr.Row():
684
- with gr.Column(scale=1):
685
- pdf_file = gr.File(
686
- label="Upload PDF",
687
- file_types=[".pdf"],
688
- type="binary"
689
- )
690
-
691
- model_dropdown = gr.Dropdown(
692
- choices=load_models(),
693
- value=DEFAULT_MODEL,
694
- label="Model"
695
- )
696
-
697
- num_questions = gr.Slider(
698
- minimum=1,
699
- maximum=5,
700
- value=3,
701
- step=1,
702
- label="Questions per Section"
703
- )
704
-
705
- include_tags = gr.Checkbox(
706
- value=True,
707
- label="Include Tags"
708
- )
709
-
710
- include_difficulty = gr.Checkbox(
711
- value=True,
712
- label="Include Difficulty Levels"
713
- )
714
-
715
- output_file_format = gr.Radio(
716
- choices=["json", "csv", "excel"],
717
- value="json",
718
- label="Save File Format"
719
- )
720
-
721
- generate_btn = gr.Button("Generate Q&A Dataset", variant="primary")
722
-
723
- progress_bar = gr.Progress()
724
-
725
- with gr.Column(scale=2):
726
- with gr.Tab("Parsed Data"):
727
- parsed_data_output = gr.JSON(label="Generated Q&A Pairs")
728
- formatted_data_output = gr.Textbox(
729
- label="Formatted Preview",
730
- lines=15
731
- )
732
-
733
- with gr.Tab("Raw Output"):
734
- raw_output = gr.Textbox(
735
- label="Raw Model Output",
736
- lines=15
737
- )
738
-
739
- file_output = gr.Textbox(label="File Output")
740
-
741
- with gr.TabItem("Documentation"):
742
- gr.Markdown("""
743
- ## How to Use
744
-
745
- 1. **Upload a PDF**: Select a PDF document containing the content you want to generate questions from.
746
- 2. **Select a model**: Choose an instruction-tuned language model from the dropdown.
747
- 3. **Configure settings**:
748
- - Set the number of questions to generate per text section
749
- - Choose whether to include tags and difficulty levels
750
- - Select your preferred output file format
751
- 4. **Generate dataset**: Click the "Generate Q&A Dataset" button to create your dataset.
752
-
753
- ## About This App
754
-
755
- This app uses instruction-tuned language models to generate question and answer pairs from PDF documents. It:
756
-
757
- 1. Extracts text from the uploaded PDF
758
- 2. Splits the text into manageable chunks
759
- 3. Generates questions, answers, tags, and difficulty levels for each chunk
760
- 4. Combines all Q&A pairs into a comprehensive dataset
761
-
762
- ### Features:
763
- - Automatic text extraction from PDFs
764
- - Smart text chunking to maintain context
765
- - Customizable number of questions per chunk
766
- - Optional tagging and difficulty classification
767
- - Multiple output formats (JSON, CSV, Excel)
768
-
769
- ### Use Cases:
770
- - Create educational resources and quiz materials
771
- - Generate training data for Q&A systems
772
- - Build flashcard datasets for studying
773
- - Develop content for educational applications
774
- """)
775
-
776
- with gr.TabItem("Status"):
777
- gr.Markdown("""
778
- ## System Status
779
-
780
- This app runs on CPU mode. Some larger models might be slower to load and generate content.
781
- If you encounter any issues with a specific model, try switching to a smaller model like `databricks/dolly-v2-3b`.
782
-
783
- ### Troubleshooting
784
-
785
- - If the app seems unresponsive after clicking "Generate", please be patient - model loading may take time.
786
- - If you get an error about model loading, try refreshing the page and selecting a different model.
787
- - Not all PDFs can be properly processed - if text extraction fails, try with a different PDF.
788
- """)
789
-
790
- # Event handler for generate button
791
- generate_btn.click(
792
- process_pdf_generate_qa,
793
- inputs=[
794
- pdf_file,
795
- model_dropdown,
796
- num_questions,
797
- include_tags,
798
- include_difficulty,
799
- output_file_format
800
- ],
801
- outputs=[parsed_data_output, formatted_data_output, raw_output, file_output],
802
- show_progress=True
803
- )
804
-
805
- return app
806
-
807
- # Export the app for Hugging Face Spaces
808
- app = create_interface()
809
-
810
- # Launch the app depending on the environment
811
- if __name__ == "__main__":
812
- app.launch()
813
-
 
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ import gradio as gr
5
+ import spaces
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer
7
+ import torch
8
+ import csv
9
+ import yaml
10
+ from typing import List, Dict, Any
11
+ import random
12
+ from pypdf import PdfReader
13
+ import re
14
+ import tempfile
15
+ from huggingface_hub import HfApi
16
+
17
+ # Configuration
18
+ DEFAULT_MODEL = "tiiuae/falcon-7b-instruct" # Use Falcon-7B as the default model
19
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Try to use CUDA if available
20
+ MAX_NEW_TOKENS = 512
21
+ TEMPERATURE = 0.7
22
+ HF_TOKEN = os.environ.get("HF_TOKEN") if os.environ.get("HF_TOKEN") else None # Get token from environment variables
23
+ MAX_RAM_GB = 45 # Set maximum RAM usage to 45GB (below the 70GB limit)
24
+
25
+ # Create offload folder for model memory management
26
+ os.makedirs("offload_folder", exist_ok=True)
27
+
28
+ # Setup RAM monitoring
29
+ def get_process_memory_usage():
30
+ """Get the current memory usage of this process in GB"""
31
+ import psutil
32
+ process = psutil.Process(os.getpid())
33
+ return process.memory_info().rss / (1024 * 1024 * 1024) # Convert to GB
34
+
35
+ class PdfExtractor:
36
+ """Extract text content from PDF files"""
37
+
38
+ @staticmethod
39
+ def extract_text_from_pdf(pdf_file):
40
+ """Extract text from a PDF file"""
41
+ try:
42
+ reader = PdfReader(pdf_file)
43
+ text = ""
44
+
45
+ for page in reader.pages:
46
+ text += page.extract_text() + "\n"
47
+
48
+ return text
49
+ except Exception as e:
50
+ print(f"Error extracting text from PDF: {e}")
51
+ return None
52
+
53
+ @staticmethod
54
+ def clean_text(text):
55
+ """Clean and preprocess extracted text"""
56
+ if not text:
57
+ return ""
58
+
59
+ # Replace multiple newlines with single newline
60
+ text = re.sub(r'\n+', '\n', text)
61
+
62
+ # Replace multiple spaces with single space
63
+ text = re.sub(r'\s+', ' ', text)
64
+
65
+ return text.strip()
66
+
67
+ @staticmethod
68
+ def chunk_text(text, max_chunk_size=1000, overlap=100):
69
+ """Split text into chunks of specified size with overlap"""
70
+ if not text:
71
+ return []
72
+
73
+ chunks = []
74
+ start = 0
75
+ text_length = len(text)
76
+
77
+ while start < text_length:
78
+ end = min(start + max_chunk_size, text_length)
79
+
80
+ # If we're not at the end, try to break at a sentence or paragraph
81
+ if end < text_length:
82
+ # Look for sentence breaks (period, question mark, exclamation mark followed by space)
83
+ sentence_break = max(
84
+ text.rfind('. ', start, end),
85
+ text.rfind('? ', start, end),
86
+ text.rfind('! ', start, end),
87
+ text.rfind('\n', start, end)
88
+ )
89
+
90
+ if sentence_break > start + max_chunk_size // 2:
91
+ end = sentence_break + 1
92
+
93
+ chunks.append(text[start:end].strip())
94
+ start = end - overlap # Create overlap with previous chunk
95
+
96
+ return chunks
97
+
98
+ class SyntheticDataGenerator:
99
+ def __init__(self, model_name=DEFAULT_MODEL):
100
+ self.model_name = model_name
101
+ self.model = None
102
+ self.tokenizer = None
103
+ self.load_model() # Load the model directly during initialization
104
+
105
+ def load_model(self):
106
+ """Load the specified model."""
107
+ # Clear CUDA cache if using GPU to prevent memory fragmentation
108
+ if torch.cuda.is_available():
109
+ torch.cuda.empty_cache()
110
+
111
+ try:
112
+ print(f"Loading model {self.model_name} on {DEVICE}...")
113
+
114
+ # Add token for authentication if available
115
+ tokenizer_kwargs = {}
116
+ model_kwargs = {
117
+ "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
118
+ "device_map": "auto" if torch.cuda.is_available() else None,
119
+ "low_cpu_mem_usage": True, # Added to reduce memory usage on CPU
120
+ "offload_folder": "offload_folder" # Add offload folder for large models
121
+ }
122
+
123
+ if HF_TOKEN:
124
+ tokenizer_kwargs["token"] = HF_TOKEN
125
+ model_kwargs["token"] = HF_TOKEN
126
+ print("Using Hugging Face token for authentication")
127
+
128
+ # Load tokenizer
129
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, **tokenizer_kwargs)
130
+
131
+ # Load the model
132
+ self.model = AutoModelForCausalLM.from_pretrained(
133
+ self.model_name,
134
+ **model_kwargs
135
+ )
136
+
137
+ # Ensure model is on the right device if not using device_map="auto"
138
+ if not torch.cuda.is_available():
139
+ self.model = self.model.to(DEVICE)
140
+
141
+ print(f"Model {self.model_name} loaded successfully on {DEVICE}")
142
+ except Exception as e:
143
+ print(f"Error loading model {self.model_name}: {e}")
144
+ self.model = None
145
+ self.tokenizer = None
146
+ raise
147
+
148
+ def generate_qa_prompt(self, context, num_questions=3, include_tags=True, difficulty_levels=True):
149
+ """Generate a prompt for creating Q&A pairs from context."""
150
+ tag_instruction = ""
151
+ if include_tags:
152
+ tag_instruction = "Add 1-3 tags for each question that categorize the topic or subject matter."
153
+
154
+ difficulty_instruction = ""
155
+ if difficulty_levels:
156
+ difficulty_instruction = "For each question, assign a difficulty level (easy, medium, or hard)."
157
+
158
+ prompt = f"""Task: Based on the following text, generate {num_questions} question and answer pairs that would be useful for comprehension testing or knowledge assessment.
159
+
160
+ CONTEXT:
161
+ {context}
162
+
163
+ For each question:
164
+ 1. Write a clear, specific question about the information in the text
165
+ 2. Provide the correct answer to the question, citing relevant details from the text
166
+ 3. {tag_instruction}
167
+ 4. {difficulty_instruction}
168
+
169
+ Format each Q&A pair as a JSON object with the following structure:
170
+ {{
171
+ "question": "The question text",
172
+ "answer": "The answer text",
173
+ "tags": ["tag1", "tag2"],
174
+ "difficulty": "easy/medium/hard"
175
+ }}
176
+
177
+ Return all Q&A pairs in a JSON array.
178
+ """
179
+ return prompt
180
+
181
+ def generate_data(self, prompt, num_samples=1):
182
+ """Generate synthetic data using the loaded model."""
183
+ if not self.model or not self.tokenizer:
184
+ return ["Error: Model not loaded properly. Please try again with a different model."]
185
+
186
+ outputs = []
187
+ for sample_idx in range(num_samples):
188
+ try:
189
+ # Clear CUDA cache before generating to free up memory
190
+ if torch.cuda.is_available():
191
+ torch.cuda.empty_cache()
192
+
193
+ # ZeroGPU errors often occur in generate() calls
194
+ # To mitigate this, try multiple approaches in sequence
195
+ inputs = self.tokenizer(prompt, return_tensors="pt").to(DEVICE)
196
+
197
+ try:
198
+ # First try: Standard generation with conservative settings
199
+ with torch.no_grad():
200
+ output = self.model.generate(
201
+ **inputs,
202
+ max_new_tokens=MAX_NEW_TOKENS,
203
+ temperature=TEMPERATURE,
204
+ do_sample=True,
205
+ pad_token_id=self.tokenizer.eos_token_id,
206
+ num_beams=1, # Use greedy decoding instead of beam search
207
+ early_stopping=True,
208
+ no_repeat_ngram_size=3 # Prevent repetition
209
+ )
210
+
211
+ decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
212
+ except (RuntimeError, Exception) as e:
213
+ if "CUDA" in str(e) or "GPU" in str(e) or "ZeroGPU" in str(e):
214
+ print(f"GPU error during generation: {e}")
215
+ print("Falling back to CPU generation...")
216
+
217
+ # Move everything to CPU
218
+ inputs = {k: v.to('cpu') for k, v in inputs.items()}
219
+
220
+ # Create CPU copy of the model if we were using GPU
221
+ if torch.cuda.is_available():
222
+ # Temporarily move model to CPU for this generation
223
+ model_cpu = self.model.to('cpu')
224
+
225
+ with torch.no_grad():
226
+ output = model_cpu.generate(
227
+ **inputs,
228
+ max_new_tokens=MAX_NEW_TOKENS,
229
+ temperature=TEMPERATURE,
230
+ do_sample=True,
231
+ pad_token_id=self.tokenizer.eos_token_id,
232
+ num_return_sequences=1,
233
+ max_length=MAX_NEW_TOKENS + inputs['input_ids'].shape[1]
234
+ )
235
+ decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
236
+
237
+ # Move model back to CUDA for future calls
238
+ self.model = self.model.to(DEVICE)
239
+ else:
240
+ # Already on CPU, try with reduced parameters
241
+ with torch.no_grad():
242
+ output = self.model.generate(
243
+ **inputs,
244
+ max_new_tokens=min(256, MAX_NEW_TOKENS), # Reduce token count
245
+ temperature=0.5, # Lower temperature
246
+ do_sample=False, # No sampling
247
+ num_return_sequences=1,
248
+ pad_token_id=self.tokenizer.eos_token_id
249
+ )
250
+ decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
251
+ else:
252
+ # Re-raise non-CUDA errors
253
+ raise
254
+
255
+ # Extract only the generated part (remove prompt)
256
+ prompt_text = self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
257
+ generated_text = decoded_output[len(prompt_text):].strip()
258
+ outputs.append(generated_text)
259
+
260
+ # Clear CUDA cache between samples
261
+ if torch.cuda.is_available():
262
+ torch.cuda.empty_cache()
263
+
264
+ except Exception as e:
265
+ error_msg = f"Error generating sample {sample_idx+1}: {str(e)}"
266
+ print(error_msg)
267
+ outputs.append(f"Error: {error_msg}")
268
+
269
+ return outputs
270
+
271
+ def parse_json_data(self, generated_text):
272
+ """Extract and parse JSON from generated text."""
273
+ try:
274
+ # Find JSON-like content (between [ and ])
275
+ start_idx = generated_text.find('[')
276
+ end_idx = generated_text.rfind(']') + 1
277
+
278
+ if start_idx >= 0 and end_idx > start_idx:
279
+ json_str = generated_text[start_idx:end_idx]
280
+ return json.loads(json_str)
281
+
282
+ # Try to find single object format
283
+ start_idx = generated_text.find('{')
284
+ end_idx = generated_text.rfind('}') + 1
285
+
286
+ if start_idx >= 0 and end_idx > start_idx:
287
+ json_str = generated_text[start_idx:end_idx]
288
+ return json.loads(json_str)
289
+
290
+ print(f"Could not find JSON content in: {generated_text}")
291
+ return None
292
+ except json.JSONDecodeError as e:
293
+ print(f"JSON parse error: {e}")
294
+ print(f"Problematic text: {generated_text}")
295
+
296
+ # Try to find and fix common JSON formatting errors
297
+ try:
298
+ # Replace single quotes with double quotes
299
+ json_str = generated_text[start_idx:end_idx].replace("'", "\"")
300
+ return json.loads(json_str)
301
+ except:
302
+ pass
303
+
304
+ # If still failing, try to extract individual JSON objects
305
+ try:
306
+ pattern = r'\{[^{}]*\}'
307
+ matches = re.findall(pattern, generated_text)
308
+ if matches:
309
+ results = []
310
+ for match in matches:
311
+ try:
312
+ # Replace single quotes with double quotes
313
+ fixed_match = match.replace("'", "\"")
314
+ obj = json.loads(fixed_match)
315
+ results.append(obj)
316
+ except:
317
+ continue
318
+ if results:
319
+ return results
320
+ except:
321
+ pass
322
+
323
+ return None
324
+
325
+ def generate_qa_from_pdf_chunk(self, chunk, num_questions=3, include_tags=True, difficulty_levels=True):
326
+ """Generate Q&A pairs from a PDF text chunk."""
327
+ if not self.model or not self.tokenizer:
328
+ return [], "Error: Model not loaded properly. Please try again with a different model."
329
+
330
+ if not chunk or len(chunk.strip()) < 100: # Skip very small chunks
331
+ return [], "Chunk too small to generate meaningful Q&A pairs."
332
+
333
+ prompt = self.generate_qa_prompt(chunk, num_questions, include_tags, difficulty_levels)
334
+ raw_outputs = self.generate_data(prompt, num_samples=1)
335
+ raw_output = raw_outputs[0]
336
+
337
+ parsed_data = self.parse_json_data(raw_output)
338
+
339
+ # Ensure parsed data is a list
340
+ if parsed_data and isinstance(parsed_data, dict):
341
+ parsed_data = [parsed_data]
342
+
343
+ # Return both the parsed data and raw output for debugging
344
+ return parsed_data, raw_output
345
+
346
+ def format_data_preview(data):
347
+ """Format the data for preview in the UI."""
348
+ if isinstance(data, list):
349
+ if len(data) > 0 and isinstance(data[0], dict):
350
+ # Convert list of dicts to DataFrame for better display
351
+ return pd.DataFrame(data).to_string()
352
+ else:
353
+ return json.dumps(data, indent=2)
354
+ elif isinstance(data, dict):
355
+ return json.dumps(data, indent=2)
356
+ else:
357
+ return str(data)
358
+
359
+ def save_data(data, format, filename_prefix):
360
+ """Save data to a file in the specified format."""
361
+ os.makedirs("synthetic_data", exist_ok=True)
362
+ timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
363
+ filename = f"synthetic_data/{filename_prefix}_{timestamp}"
364
+
365
+ if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
366
+ df = pd.DataFrame(data)
367
+
368
+ if format.lower() == "csv":
369
+ full_filename = f"{filename}.csv"
370
+ df.to_csv(full_filename, index=False)
371
+ elif format.lower() == "json":
372
+ full_filename = f"{filename}.json"
373
+ with open(full_filename, "w") as f:
374
+ json.dump(data, f, indent=2)
375
+ elif format.lower() == "excel":
376
+ full_filename = f"{filename}.xlsx"
377
+ df.to_excel(full_filename, index=False)
378
+ else:
379
+ full_filename = f"{filename}.txt"
380
+ with open(full_filename, "w") as f:
381
+ f.write(str(data))
382
+ else:
383
+ full_filename = f"{filename}.{format.lower()}"
384
+ with open(full_filename, "w") as f:
385
+ if format.lower() == "json":
386
+ json.dump(data, f, indent=2)
387
+ else:
388
+ f.write(str(data))
389
+
390
+ return full_filename
391
+
392
+ def load_models():
393
+ """Return a list of available models."""
394
+ return [
395
+ "tiiuae/falcon-7b-instruct"
396
+ ]
397
+
398
+ @spaces.GPU
399
+ def process_pdf_generate_qa(pdf_file, model_name, num_questions_per_chunk, include_tags, include_difficulty, output_file_format, progress=None):
400
+ """Process a PDF file and generate Q&A pairs from its content."""
401
+ if pdf_file is None:
402
+ return None, "Error: No PDF file uploaded", "", "No file provided"
403
+
404
+ try:
405
+ # Check RAM usage at start
406
+ current_ram_usage = get_process_memory_usage()
407
+ print(f"Starting RAM usage: {current_ram_usage:.2f}GB")
408
+
409
+ # Clear CUDA cache before starting
410
+ if torch.cuda.is_available():
411
+ torch.cuda.empty_cache()
412
+
413
+ # Initialize extractor and generator
414
+ extractor = PdfExtractor()
415
+ generator = SyntheticDataGenerator(model_name)
416
+
417
+ # Wrap model loading in try-except to handle errors
418
+ try:
419
+ load_success = generator.load_model()
420
+ if not load_success:
421
+ return None, "Error: Failed to load the model. Please try again with a different model.", "", "Model loading failed"
422
+ except Exception as e:
423
+ if "ZeroGPU" in str(e) or "GPU task aborted" in str(e) or "CUDA" in str(e):
424
+ print(f"GPU error during model loading: {e}. Trying with a smaller model...")
425
+ # If we get a ZeroGPU error, immediately try the smallest model
426
+ generator.model_name = "tiiuae/falcon-7b-instruct" # Use default model as fallback
427
+ load_success = generator.load_model()
428
+ if not load_success:
429
+ return None, "Error: Failed to load any model even after fallback. Please try again later.", "", "Model loading failed"
430
+ else:
431
+ # Re-raise other errors
432
+ raise
433
+
434
+ # Check RAM usage after model loading
435
+ ram_after_model = get_process_memory_usage()
436
+ print(f"RAM usage after model loading: {ram_after_model:.2f}GB")
437
+
438
+ # Save PDF temporarily if it's a file object
439
+ if hasattr(pdf_file, 'name'):
440
+ # It's already a file path
441
+ pdf_path = pdf_file.name
442
+ else:
443
+ # Create a temporary file
444
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
445
+ tmp.write(pdf_file)
446
+ pdf_path = tmp.name
447
+
448
+ # Extract text from PDF
449
+ pdf_text = extractor.extract_text_from_pdf(pdf_path)
450
+
451
+ if not pdf_text:
452
+ return None, "Failed to extract text from PDF", "", "No data generated"
453
+
454
+ # Clean and chunk the text - reduce chunk size to use less memory
455
+ cleaned_text = extractor.clean_text(pdf_text)
456
+ chunks = extractor.chunk_text(cleaned_text, max_chunk_size=400, overlap=30)
457
+
458
+ # Check RAM after PDF processing
459
+ ram_after_pdf = get_process_memory_usage()
460
+ print(f"RAM usage after PDF processing: {ram_after_pdf:.2f}GB, found {len(chunks)} chunks")
461
+
462
+ # If we're approaching the RAM limit already, reduce batch size
463
+ batch_size = 3 # Default
464
+ if ram_after_pdf > MAX_RAM_GB * 0.7: # If already using 70% of our limit
465
+ batch_size = 1 # Process one chunk at a time
466
+ print(f"High RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 1")
467
+ elif ram_after_pdf > MAX_RAM_GB * 0.5: # If using 50% of our limit
468
+ batch_size = 2 # Process two chunks at a time
469
+ print(f"Moderate RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 2")
470
+
471
+ # Generate Q&A pairs for each chunk
472
+ all_qa_pairs = []
473
+ all_raw_outputs = []
474
+
475
+ total_chunks = len(chunks)
476
+
477
+ # Process chunks in smaller batches to avoid memory buildup
478
+ for i in range(0, total_chunks, batch_size):
479
+ # Get the current batch of chunks
480
+ batch_chunks = chunks[i:min(i+batch_size, total_chunks)]
481
+
482
+ # Process each chunk in the batch
483
+ for j, chunk in enumerate(batch_chunks):
484
+ chunk_index = i + j
485
+
486
+ if progress is not None:
487
+ progress(chunk_index / total_chunks, f"Processing chunk {chunk_index+1}/{total_chunks}")
488
+
489
+ # Check if we're approaching RAM limit
490
+ current_ram = get_process_memory_usage()
491
+ if current_ram > MAX_RAM_GB * 0.9: # Over 90% of our limit
492
+ print(f"WARNING: High RAM usage detected: {current_ram:.2f}GB - force releasing memory")
493
+ import gc
494
+ gc.collect() # Force garbage collection
495
+ if torch.cuda.is_available():
496
+ torch.cuda.empty_cache()
497
+
498
+ # If still too high after garbage collection, abort batch processing
499
+ current_ram = get_process_memory_usage()
500
+ if current_ram > MAX_RAM_GB * 0.95: # Still dangerously high
501
+ print(f"CRITICAL: RAM usage too high ({current_ram:.2f}GB), stopping processing")
502
+ break
503
+
504
+ # Clear CUDA cache between chunks
505
+ if torch.cuda.is_available():
506
+ torch.cuda.empty_cache()
507
+
508
+ try:
509
+ qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(
510
+ chunk,
511
+ num_questions=num_questions_per_chunk,
512
+ include_tags=include_tags,
513
+ difficulty_levels=include_difficulty
514
+ )
515
+ except Exception as e:
516
+ error_type = str(e)
517
+ if "CUDA" in error_type or "GPU" in error_type or "ZeroGPU" in error_type:
518
+ print(f"GPU error during generation for chunk {chunk_index+1}: {e}")
519
+ # Fall back to CPU for this specific generation
520
+ raw_output = f"Error in chunk {chunk_index+1}: {str(e)}. Skipping..."
521
+ qa_pairs = None
522
+ elif "memory" in error_type.lower() or "ram" in error_type.lower():
523
+ print(f"Memory error processing chunk {chunk_index+1}: {e}")
524
+ # Force garbage collection and skip chunk
525
+ import gc
526
+ gc.collect()
527
+ if torch.cuda.is_available():
528
+ torch.cuda.empty_cache()
529
+ raw_output = f"Memory error in chunk {chunk_index+1}: {str(e)}. Skipping..."
530
+ qa_pairs = None
531
+ else:
532
+ # For other errors, just log and continue
533
+ print(f"Error processing chunk {chunk_index+1}: {e}")
534
+ raw_output = f"Error in chunk {chunk_index+1}: {str(e)}"
535
+ qa_pairs = None
536
+
537
+ if qa_pairs:
538
+ all_qa_pairs.extend(qa_pairs)
539
+ all_raw_outputs.append(raw_output)
540
+
541
+ # Check RAM usage after processing this chunk
542
+ current_ram = get_process_memory_usage()
543
+ print(f"RAM after chunk {chunk_index+1}: {current_ram:.2f}GB")
544
+
545
+ # Do a thorough cleanup after each batch
546
+ if torch.cuda.is_available():
547
+ torch.cuda.empty_cache()
548
+
549
+ # Force garbage collection between batches
550
+ import gc
551
+ gc.collect()
552
+
553
+ # Check if we need to abort due to memory constraints
554
+ current_ram = get_process_memory_usage()
555
+ if current_ram > MAX_RAM_GB:
556
+ print(f"WARNING: Exceeding RAM limit ({current_ram:.2f}GB). Stopping further processing.")
557
+ if progress is not None:
558
+ progress(1.0, f"Stopped early due to high memory usage ({current_ram:.2f}GB)")
559
+ break
560
+
561
+ if progress is not None:
562
+ progress(1.0, "Finished processing")
563
+
564
+ # Final cache clear and garbage collection
565
+ if torch.cuda.is_available():
566
+ torch.cuda.empty_cache()
567
+ import gc
568
+ gc.collect()
569
+
570
+ if not all_qa_pairs:
571
+ return None, "Failed to generate Q&A pairs", "\n\n".join(all_raw_outputs), "No data generated"
572
+
573
+ # Save data to file
574
+ filename = save_data(
575
+ all_qa_pairs,
576
+ output_file_format,
577
+ "qa_dataset"
578
+ )
579
+
580
+ # Format for display
581
+ formatted_data = format_data_preview(all_qa_pairs)
582
+
583
+ # Final memory report
584
+ final_ram = get_process_memory_usage()
585
+ print(f"Final RAM usage: {final_ram:.2f}GB")
586
+
587
+ return all_qa_pairs, formatted_data, "\n\n".join(all_raw_outputs), f"Data saved to {filename}"
588
+ except Exception as e:
589
+ error_msg = f"Error processing PDF: {str(e)}"
590
+ print(error_msg)
591
+ import traceback
592
+ print(traceback.format_exc())
593
+ return None, error_msg, "", "Processing failed"
594
+
595
+ # Set up the Gradio interface
596
+ def create_interface():
597
+ with gr.Blocks(title="PDF Q&A Dataset Generator") as app:
598
+ gr.Markdown("# 📚 PDF Q&A Dataset Generator")
599
+ gr.Markdown("""
600
+ Generate question & answer datasets from PDF documents using instruction-tuned language models.
601
+ Perfect for creating educational resources, quiz materials, or training data for Q&A systems.
602
+ """)
603
+
604
+ with gr.Tabs() as tabs:
605
+ with gr.TabItem("Generate Q&A Dataset"):
606
+ with gr.Row():
607
+ with gr.Column(scale=1):
608
+ pdf_file = gr.File(
609
+ label="Upload PDF",
610
+ file_types=[".pdf"],
611
+ type="binary"
612
+ )
613
+
614
+ model_dropdown = gr.Dropdown(
615
+ choices=load_models(),
616
+ value=DEFAULT_MODEL,
617
+ label="Model"
618
+ )
619
+
620
+ num_questions = gr.Slider(
621
+ minimum=1,
622
+ maximum=5,
623
+ value=3,
624
+ step=1,
625
+ label="Questions per Section"
626
+ )
627
+
628
+ include_tags = gr.Checkbox(
629
+ value=True,
630
+ label="Include Tags"
631
+ )
632
+
633
+ include_difficulty = gr.Checkbox(
634
+ value=True,
635
+ label="Include Difficulty Levels"
636
+ )
637
+
638
+ output_file_format = gr.Radio(
639
+ choices=["json", "csv", "excel"],
640
+ value="json",
641
+ label="Save File Format"
642
+ )
643
+
644
+ generate_btn = gr.Button("Generate Q&A Dataset", variant="primary")
645
+
646
+ progress_bar = gr.Progress()
647
+
648
+ with gr.Column(scale=2):
649
+ with gr.Tab("Parsed Data"):
650
+ parsed_data_output = gr.JSON(label="Generated Q&A Pairs")
651
+ formatted_data_output = gr.Textbox(
652
+ label="Formatted Preview",
653
+ lines=15
654
+ )
655
+
656
+ with gr.Tab("Raw Output"):
657
+ raw_output = gr.Textbox(
658
+ label="Raw Model Output",
659
+ lines=15
660
+ )
661
+
662
+ file_output = gr.Textbox(label="File Output")
663
+
664
+ with gr.TabItem("Documentation"):
665
+ gr.Markdown("""
666
+ ## How to Use
667
+
668
+ 1. **Upload a PDF**: Select a PDF document containing the content you want to generate questions from.
669
+ 2. **Select a model**: Choose an instruction-tuned language model from the dropdown.
670
+ 3. **Configure settings**:
671
+ - Set the number of questions to generate per text section
672
+ - Choose whether to include tags and difficulty levels
673
+ - Select your preferred output file format
674
+ 4. **Generate dataset**: Click the "Generate Q&A Dataset" button to create your dataset.
675
+
676
+ ## About This App
677
+
678
+ This app uses instruction-tuned language models to generate question and answer pairs from PDF documents. It:
679
+
680
+ 1. Extracts text from the uploaded PDF
681
+ 2. Splits the text into manageable chunks
682
+ 3. Generates questions, answers, tags, and difficulty levels for each chunk
683
+ 4. Combines all Q&A pairs into a comprehensive dataset
684
+
685
+ ### Features:
686
+ - Automatic text extraction from PDFs
687
+ - Smart text chunking to maintain context
688
+ - Customizable number of questions per chunk
689
+ - Optional tagging and difficulty classification
690
+ - Multiple output formats (JSON, CSV, Excel)
691
+
692
+ ### Use Cases:
693
+ - Create educational resources and quiz materials
694
+ - Generate training data for Q&A systems
695
+ - Build flashcard datasets for studying
696
+ - Develop content for educational applications
697
+ """)
698
+
699
+ with gr.TabItem("Status"):
700
+ gr.Markdown("""
701
+ ## System Status
702
+
703
+ This app may run in CPU mode, and a 7B-parameter model such as `tiiuae/falcon-7b-instruct` can be slow to load and to generate content.
704
+ If generation fails or seems stuck, give the model time to finish loading before retrying.
705
+
706
+ ### Troubleshooting
707
+
708
+ - If the app seems unresponsive after clicking "Generate", please be patient - model loading may take time.
709
+ - If you get an error about model loading, try refreshing the page and running the generation again.
710
+ - Not all PDFs can be properly processed - if text extraction fails, try with a different PDF.
711
+ """)
712
+
713
+ # Event handler for generate button
714
+ generate_btn.click(
715
+ process_pdf_generate_qa,
716
+ inputs=[
717
+ pdf_file,
718
+ model_dropdown,
719
+ num_questions,
720
+ include_tags,
721
+ include_difficulty,
722
+ output_file_format
723
+ ],
724
+ outputs=[parsed_data_output, formatted_data_output, raw_output, file_output],
725
+ show_progress=True
726
+ )
727
+
728
+ return app
729
+
730
+ # Export the app for Hugging Face Spaces
731
+ app = create_interface()
732
+
733
+ # Launch the app depending on the environment
734
+ if __name__ == "__main__":
735
+ app.launch()
736
+
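
The refactored module can also be exercised headlessly, which is handy for smoke-testing this commit before running the Space. The sketch below is illustrative only, not part of the commit: it assumes app.py is importable from the working directory with its dependencies (gradio, spaces, transformers, torch, pypdf, psutil) installed, that a hypothetical sample.pdf exists locally, and that the falcon-7b-instruct weights can be downloaded (set HF_TOKEN if the model or network requires it).

# Minimal, headless smoke test for the pipeline defined in app.py (a sketch under the assumptions above).
from app import PdfExtractor, SyntheticDataGenerator, format_data_preview, save_data

pdf_path = "sample.pdf"  # hypothetical local document

extractor = PdfExtractor()
raw_text = extractor.extract_text_from_pdf(pdf_path)
if raw_text:
    chunks = extractor.chunk_text(extractor.clean_text(raw_text), max_chunk_size=400, overlap=30)

    # In this revision the model is loaded eagerly inside __init__.
    generator = SyntheticDataGenerator()

    qa_pairs = []
    for chunk in chunks[:2]:  # a couple of chunks is enough for a quick check
        parsed, raw = generator.generate_qa_from_pdf_chunk(chunk, num_questions=2)
        if parsed:
            qa_pairs.extend(parsed)

    print(format_data_preview(qa_pairs))
    if qa_pairs:
        print("Saved to:", save_data(qa_pairs, "json", "qa_smoke_test"))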