import os
import json
import re
import tempfile

import pandas as pd
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from pypdf import PdfReader

# Configuration
DEFAULT_MODEL = "tiiuae/falcon-7b-instruct"  # Use Falcon-7B as the default model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # Try to use CUDA if available
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token from the environment, if set
MAX_RAM_GB = 45  # Set maximum RAM usage to 45GB (below the 70GB limit)

# Create offload folder for model memory management
os.makedirs("offload_folder", exist_ok=True)

# Setup RAM monitoring
def get_process_memory_usage():
    """Get the current memory usage of this process in GB"""
    import psutil
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024 * 1024)  # Convert to GB

class PdfExtractor:
    """Extract text content from PDF files"""
    
    @staticmethod
    def extract_text_from_pdf(pdf_file):
        """Extract text from a PDF file"""
        try:
            reader = PdfReader(pdf_file)
            text = ""
            
            for page in reader.pages:
                text += page.extract_text() + "\n"
                
            return text
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return None
    
    @staticmethod
    def clean_text(text):
        """Clean and preprocess extracted text"""
        if not text:
            return ""
            
        # Replace multiple newlines with single newline
        text = re.sub(r'\n+', '\n', text)
        
        # Replace multiple spaces with single space
        text = re.sub(r'\s+', ' ', text)
        
        return text.strip()
    
    @staticmethod
    def chunk_text(text, max_chunk_size=1000, overlap=100):
        """Split text into chunks of specified size with overlap"""
        if not text:
            return []
            
        chunks = []
        start = 0
        text_length = len(text)
        
        while start < text_length:
            end = min(start + max_chunk_size, text_length)
            
            # If we're not at the end, try to break at a sentence or paragraph
            if end < text_length:
                # Look for sentence breaks (period, question mark, exclamation mark followed by space)
                sentence_break = max(
                    text.rfind('. ', start, end),
                    text.rfind('? ', start, end),
                    text.rfind('! ', start, end),
                    text.rfind('\n', start, end)
                )
                
                if sentence_break > start + max_chunk_size // 2:
                    end = sentence_break + 1
            
            chunks.append(text[start:end].strip())

            # Stop once the end of the text is reached; otherwise the overlap
            # step below would keep re-emitting the final chunk forever
            if end >= text_length:
                break
            start = end - overlap  # Step back to create overlap with the previous chunk
            
        return chunks
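

# Illustrative sketch (hypothetical helper, not called by the app): shows how
# PdfExtractor.chunk_text splits a long string into overlapping chunks. The
# sample text and sizes below are made up purely for demonstration.
def _demo_chunk_text():
    sample = ". ".join(f"Sentence number {i} of the sample document" for i in range(40)) + "."
    chunks = PdfExtractor.chunk_text(sample, max_chunk_size=200, overlap=40)
    for idx, chunk in enumerate(chunks):
        # Consecutive chunks share roughly `overlap` characters of context
        print(f"chunk {idx}: {len(chunk)} chars -> {chunk[:60]}...")
    return chunks
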

class SyntheticDataGenerator:
    def __init__(self, model_name=DEFAULT_MODEL):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.load_model()  # Load the model directly during initialization

    def load_model(self):
        """Load the specified model."""
        # Clear CUDA cache if using GPU to prevent memory fragmentation
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        try:
            print(f"Loading model {self.model_name} on {DEVICE}...")

            # Add token for authentication if available
            tokenizer_kwargs = {}
            model_kwargs = {
                "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
                "device_map": "auto" if torch.cuda.is_available() else None,
                "low_cpu_mem_usage": True,  # Added to reduce memory usage on CPU
                "offload_folder": "offload_folder"  # Add offload folder for large models
            }

            if HF_TOKEN:
                tokenizer_kwargs["token"] = HF_TOKEN
                model_kwargs["token"] = HF_TOKEN
                print("Using Hugging Face token for authentication")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, **tokenizer_kwargs)

            # Load the model
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                **model_kwargs
            )

            # Ensure model is on the right device if not using device_map="auto"
            if not torch.cuda.is_available():
                self.model = self.model.to(DEVICE)

            print(f"Model {self.model_name} loaded successfully on {DEVICE}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            self.model = None
            self.tokenizer = None
            raise

    def generate_qa_prompt(self, context, num_questions=3, include_tags=True, difficulty_levels=True):
        """Generate a prompt for creating Q&A pairs from context."""
        tag_instruction = ""
        if include_tags:
            tag_instruction = "Add 1-3 tags for each question that categorize the topic or subject matter."
            
        difficulty_instruction = ""
        if difficulty_levels:
            difficulty_instruction = "For each question, assign a difficulty level (easy, medium, or hard)."
        
        prompt = f"""Task: Based on the following text, generate {num_questions} question and answer pairs that would be useful for comprehension testing or knowledge assessment.

CONTEXT:
{context}

For each question:
1. Write a clear, specific question about the information in the text
2. Provide the correct answer to the question, citing relevant details from the text
3. {tag_instruction}
4. {difficulty_instruction}

Format each Q&A pair as a JSON object with the following structure:
{{
  "question": "The question text",
  "answer": "The answer text",
  "tags": ["tag1", "tag2"],
  "difficulty": "easy/medium/hard"
}}

Return all Q&A pairs in a JSON array.
"""
        return prompt
    
    def generate_data(self, prompt, num_samples=1):
        """Generate synthetic data using the loaded model."""
        if not self.model or not self.tokenizer:
            return ["Error: Model not loaded properly. Please try again with a different model."]
            
        outputs = []
        for sample_idx in range(num_samples):
            try:
                # Clear CUDA cache before generating to free up memory
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    
                # ZeroGPU errors often occur in generate() calls
                # To mitigate this, try multiple approaches in sequence
                inputs = self.tokenizer(prompt, return_tensors="pt").to(DEVICE)
                
                try:
                    # First try: Standard generation with conservative settings
                    with torch.no_grad():
                        output = self.model.generate(
                            **inputs,
                            max_new_tokens=MAX_NEW_TOKENS,
                            temperature=TEMPERATURE,
                            do_sample=True,
                            pad_token_id=self.tokenizer.eos_token_id,
                            num_beams=1,  # Single-sequence sampling, no beam search
                            no_repeat_ngram_size=3  # Discourage repetition
                        )
                    
                    decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
                except Exception as e:
                    if "CUDA" in str(e) or "GPU" in str(e) or "ZeroGPU" in str(e):
                        print(f"GPU error during generation: {e}")
                        print("Falling back to CPU generation...")
                        
                        # Move everything to CPU
                        inputs = {k: v.to('cpu') for k, v in inputs.items()}
                        
                        # If the GPU was in use, retry this generation on the CPU
                        if torch.cuda.is_available():
                            # Temporarily move model to CPU for this generation
                            model_cpu = self.model.to('cpu')
                            
                            with torch.no_grad():
                                output = model_cpu.generate(
                                    **inputs,
                                    max_new_tokens=MAX_NEW_TOKENS,
                                    temperature=TEMPERATURE,
                                    do_sample=True,
                                    pad_token_id=self.tokenizer.eos_token_id,
                                    num_return_sequences=1,
                                    max_length=MAX_NEW_TOKENS + inputs['input_ids'].shape[1]
                                )
                            decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
                            
                            # Move model back to CUDA for future calls
                            self.model = self.model.to(DEVICE)
                        else:
                            # Already on CPU, try with reduced parameters
                            with torch.no_grad():
                                output = self.model.generate(
                                    **inputs,
                                    max_new_tokens=min(256, MAX_NEW_TOKENS),  # Reduce token count
                                    do_sample=False,  # Greedy decoding (temperature is ignored when not sampling)
                                    num_return_sequences=1,
                                    pad_token_id=self.tokenizer.eos_token_id
                                )
                            decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
                    else:
                        # Re-raise non-CUDA errors
                        raise
                        
                # Extract only the generated part (remove prompt)
                # Use subscript access so this also works when `inputs` was rebuilt as a plain dict in the CPU fallback
                prompt_text = self.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
                generated_text = decoded_output[len(prompt_text):].strip()
                outputs.append(generated_text)
                
                # Clear CUDA cache between samples
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    
            except Exception as e:
                error_msg = f"Error generating sample {sample_idx+1}: {str(e)}"
                print(error_msg)
                outputs.append(f"Error: {error_msg}")
        
        return outputs
    
    def parse_json_data(self, generated_text):
        """Extract and parse JSON from generated text."""
        try:
            # Find JSON-like content (between [ and ])
            start_idx = generated_text.find('[')
            end_idx = generated_text.rfind(']') + 1
            
            if start_idx >= 0 and end_idx > start_idx:
                json_str = generated_text[start_idx:end_idx]
                return json.loads(json_str)
            
            # Try to find single object format
            start_idx = generated_text.find('{')
            end_idx = generated_text.rfind('}') + 1
            
            if start_idx >= 0 and end_idx > start_idx:
                json_str = generated_text[start_idx:end_idx]
                return json.loads(json_str)
                
            print(f"Could not find JSON content in: {generated_text}")
            return None
        except json.JSONDecodeError as e:
            print(f"JSON parse error: {e}")
            print(f"Problematic text: {generated_text}")
            
            # Try to fix a common formatting error: single quotes instead of double quotes
            try:
                json_str = generated_text[start_idx:end_idx].replace("'", "\"")
                return json.loads(json_str)
            except json.JSONDecodeError:
                pass

            # If that still fails, extract and parse individual JSON objects one by one
            results = []
            for match in re.findall(r'\{[^{}]*\}', generated_text):
                try:
                    results.append(json.loads(match.replace("'", "\"")))
                except json.JSONDecodeError:
                    continue
            if results:
                return results
                
            return None

    def generate_qa_from_pdf_chunk(self, chunk, num_questions=3, include_tags=True, difficulty_levels=True):
        """Generate Q&A pairs from a PDF text chunk."""
        if not self.model or not self.tokenizer:
            return [], "Error: Model not loaded properly. Please try again with a different model."
            
        if not chunk or len(chunk.strip()) < 100:  # Skip very small chunks
            return [], "Chunk too small to generate meaningful Q&A pairs."
            
        prompt = self.generate_qa_prompt(chunk, num_questions, include_tags, difficulty_levels)
        raw_outputs = self.generate_data(prompt, num_samples=1)
        raw_output = raw_outputs[0]
        
        parsed_data = self.parse_json_data(raw_output)
        
        # Ensure parsed data is a list
        if parsed_data and isinstance(parsed_data, dict):
            parsed_data = [parsed_data]
            
        # Return both the parsed data and raw output for debugging
        return parsed_data, raw_output
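

# Illustrative sketch (hypothetical helper, not called by the app): how
# SyntheticDataGenerator can be used outside the Gradio UI. Loading the default
# 7B model needs substantial RAM/VRAM, so treat this as a usage outline rather
# than a quick test.
def _demo_generate_qa(text_chunk: str):
    generator = SyntheticDataGenerator(DEFAULT_MODEL)  # loads tokenizer and model in __init__
    qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(
        text_chunk,
        num_questions=2,
        include_tags=True,
        difficulty_levels=True,
    )
    if qa_pairs:
        print(json.dumps(qa_pairs, indent=2))
    else:
        # If JSON parsing failed, the raw model output helps with debugging
        print(raw_output)
    return qa_pairs
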

def format_data_preview(data):
    """Format the data for preview in the UI."""
    if isinstance(data, list):
        if len(data) > 0 and isinstance(data[0], dict):
            # Convert list of dicts to DataFrame for better display
            return pd.DataFrame(data).to_string()
        else:
            return json.dumps(data, indent=2)
    elif isinstance(data, dict):
        return json.dumps(data, indent=2)
    else:
        return str(data)

def save_data(data, format, filename_prefix):
    """Save data to a file in the specified format."""
    os.makedirs("synthetic_data", exist_ok=True)
    timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    filename = f"synthetic_data/{filename_prefix}_{timestamp}"
    
    if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
        df = pd.DataFrame(data)
        
        if format.lower() == "csv":
            full_filename = f"{filename}.csv"
            df.to_csv(full_filename, index=False)
        elif format.lower() == "json":
            full_filename = f"{filename}.json"
            with open(full_filename, "w") as f:
                json.dump(data, f, indent=2)
        elif format.lower() == "excel":
            full_filename = f"{filename}.xlsx"
            df.to_excel(full_filename, index=False)
        else:
            full_filename = f"{filename}.txt"
            with open(full_filename, "w") as f:
                f.write(str(data))
    else:
        full_filename = f"{filename}.{format.lower()}"
        with open(full_filename, "w") as f:
            if format.lower() == "json":
                json.dump(data, f, indent=2)
            else:
                f.write(str(data))
    
    return full_filename

def load_models():
    """Return a list of available models."""
    return [
        "tiiuae/falcon-7b-instruct"
    ]

@spaces.GPU
def process_pdf_generate_qa(pdf_file, model_name, num_questions_per_chunk, include_tags, include_difficulty, output_file_format, progress=gr.Progress()):
    """Process a PDF file and generate Q&A pairs from its content."""
    if pdf_file is None:
        return None, "Error: No PDF file uploaded", "", "No file provided"
        
    try:
        # Check RAM usage at start
        current_ram_usage = get_process_memory_usage()
        print(f"Starting RAM usage: {current_ram_usage:.2f}GB")
        
        # Clear CUDA cache before starting
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            
        # Initialize the extractor and generator; the generator loads the model in __init__
        extractor = PdfExtractor()
        try:
            generator = SyntheticDataGenerator(model_name)
        except Exception as e:
            if "ZeroGPU" in str(e) or "GPU task aborted" in str(e) or "CUDA" in str(e):
                print(f"GPU error during model loading: {e}. Retrying with the default model...")
                # On a GPU/ZeroGPU error, fall back to the default model
                generator = SyntheticDataGenerator(DEFAULT_MODEL)
            else:
                # Re-raise other errors
                raise

        if generator.model is None or generator.tokenizer is None:
            return None, "Error: Failed to load the model. Please try again with a different model.", "", "Model loading failed"
                
        # Check RAM usage after model loading
        ram_after_model = get_process_memory_usage()
        print(f"RAM usage after model loading: {ram_after_model:.2f}GB")
        
        # Save PDF temporarily if it's a file object
        if hasattr(pdf_file, 'name'):
            # It's already a file path
            pdf_path = pdf_file.name
        else:
            # Create a temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
                tmp.write(pdf_file)
                pdf_path = tmp.name
        
        # Extract text from PDF
        pdf_text = extractor.extract_text_from_pdf(pdf_path)
        
        if not pdf_text:
            return None, "Failed to extract text from PDF", "", "No data generated"
        
        # Clean and chunk the text - reduce chunk size to use less memory
        cleaned_text = extractor.clean_text(pdf_text)
        chunks = extractor.chunk_text(cleaned_text, max_chunk_size=400, overlap=30)  
        
        # Check RAM after PDF processing
        ram_after_pdf = get_process_memory_usage()
        print(f"RAM usage after PDF processing: {ram_after_pdf:.2f}GB, found {len(chunks)} chunks")
        
        # If we're approaching the RAM limit already, reduce batch size
        batch_size = 3  # Default
        if ram_after_pdf > MAX_RAM_GB * 0.7:  # If already using 70% of our limit
            batch_size = 1  # Process one chunk at a time
            print(f"High RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 1")
        elif ram_after_pdf > MAX_RAM_GB * 0.5:  # If using 50% of our limit
            batch_size = 2  # Process two chunks at a time
            print(f"Moderate RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 2")
        
        # Generate Q&A pairs for each chunk
        all_qa_pairs = []
        all_raw_outputs = []
        
        total_chunks = len(chunks)
        
        # Process chunks in smaller batches to avoid memory buildup
        for i in range(0, total_chunks, batch_size):
            # Get the current batch of chunks
            batch_chunks = chunks[i:min(i+batch_size, total_chunks)]
            
            # Process each chunk in the batch
            for j, chunk in enumerate(batch_chunks):
                chunk_index = i + j
                
                if progress is not None:
                    progress(chunk_index / total_chunks, f"Processing chunk {chunk_index+1}/{total_chunks}")
                
                # Check if we're approaching RAM limit
                current_ram = get_process_memory_usage()
                if current_ram > MAX_RAM_GB * 0.9:  # Over 90% of our limit
                    print(f"WARNING: High RAM usage detected: {current_ram:.2f}GB - force releasing memory")
                    import gc
                    gc.collect()  # Force garbage collection
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    
                    # If usage is still dangerously high after garbage collection, abandon the
                    # rest of this batch (the overall RAM limit is re-checked after each batch)
                    current_ram = get_process_memory_usage()
                    if current_ram > MAX_RAM_GB * 0.95:
                        print(f"CRITICAL: RAM usage too high ({current_ram:.2f}GB), skipping the rest of this batch")
                        break
                
                # Clear CUDA cache between chunks
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                
                try:
                    qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(
                        chunk, 
                        num_questions=num_questions_per_chunk,
                        include_tags=include_tags,
                        difficulty_levels=include_difficulty
                    )
                except Exception as e:
                    error_type = str(e)
                    if "CUDA" in error_type or "GPU" in error_type or "ZeroGPU" in error_type:
                        print(f"GPU error during generation for chunk {chunk_index+1}: {e}")
                        # Fall back to CPU for this specific generation
                        raw_output = f"Error in chunk {chunk_index+1}: {str(e)}. Skipping..."
                        qa_pairs = None
                    elif "memory" in error_type.lower() or "ram" in error_type.lower():
                        print(f"Memory error processing chunk {chunk_index+1}: {e}")
                        # Force garbage collection and skip chunk
                        import gc
                        gc.collect()
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                        raw_output = f"Memory error in chunk {chunk_index+1}: {str(e)}. Skipping..."
                        qa_pairs = None
                    else:
                        # For other errors, just log and continue
                        print(f"Error processing chunk {chunk_index+1}: {e}")
                        raw_output = f"Error in chunk {chunk_index+1}: {str(e)}"
                        qa_pairs = None
                
                if qa_pairs:
                    all_qa_pairs.extend(qa_pairs)
                all_raw_outputs.append(raw_output)
                
                # Check RAM usage after processing this chunk
                current_ram = get_process_memory_usage()
                print(f"RAM after chunk {chunk_index+1}: {current_ram:.2f}GB")
            
            # Do a thorough cleanup after each batch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            
            # Force garbage collection between batches
            import gc
            gc.collect()
            
            # Check if we need to abort due to memory constraints
            current_ram = get_process_memory_usage()
            if current_ram > MAX_RAM_GB:
                print(f"WARNING: Exceeding RAM limit ({current_ram:.2f}GB). Stopping further processing.")
                if progress is not None:
                    progress(1.0, f"Stopped early due to high memory usage ({current_ram:.2f}GB)")
                break
        
        if progress is not None:
            progress(1.0, "Finished processing")
        
        # Final cache clear and garbage collection
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        import gc
        gc.collect()
            
        if not all_qa_pairs:
            return None, "Failed to generate Q&A pairs", "\n\n".join(all_raw_outputs), "No data generated"
        
        # Save data to file
        filename = save_data(
            all_qa_pairs,
            output_file_format,
            "qa_dataset"
        )
        
        # Format for display
        formatted_data = format_data_preview(all_qa_pairs)
        
        # Final memory report
        final_ram = get_process_memory_usage()
        print(f"Final RAM usage: {final_ram:.2f}GB")
        
        return all_qa_pairs, formatted_data, "\n\n".join(all_raw_outputs), f"Data saved to {filename}"
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        print(error_msg)
        import traceback
        print(traceback.format_exc())
        return None, error_msg, "", "Processing failed"

# Set up the Gradio interface
def create_interface():
    with gr.Blocks(title="PDF Q&A Dataset Generator") as app:
        gr.Markdown("# πŸ“š PDF Q&A Dataset Generator")
        gr.Markdown("""
        Generate question & answer datasets from PDF documents using instruction-tuned language models.
        Perfect for creating educational resources, quiz materials, or training data for Q&A systems.
        """)
        
        with gr.Tabs() as tabs:
            with gr.TabItem("Generate Q&A Dataset"):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_file = gr.File(
                            label="Upload PDF",
                            file_types=[".pdf"],
                            type="binary"
                        )
                        
                        model_dropdown = gr.Dropdown(
                            choices=load_models(),
                            value=DEFAULT_MODEL,
                            label="Model"
                        )
                        
                        num_questions = gr.Slider(
                            minimum=1,
                            maximum=5,
                            value=3,
                            step=1,
                            label="Questions per Section"
                        )
                        
                        include_tags = gr.Checkbox(
                            value=True,
                            label="Include Tags"
                        )
                        
                        include_difficulty = gr.Checkbox(
                            value=True,
                            label="Include Difficulty Levels"
                        )
                        
                        output_file_format = gr.Radio(
                            choices=["json", "csv", "excel"],
                            value="json",
                            label="Save File Format"
                        )
                        
                        generate_btn = gr.Button("Generate Q&A Dataset", variant="primary")
                        
                    
                    with gr.Column(scale=2):
                        with gr.Tab("Parsed Data"):
                            parsed_data_output = gr.JSON(label="Generated Q&A Pairs")
                            formatted_data_output = gr.Textbox(
                                label="Formatted Preview",
                                lines=15
                            )
                        
                        with gr.Tab("Raw Output"):
                            raw_output = gr.Textbox(
                                label="Raw Model Output",
                                lines=15
                            )
                        
                        file_output = gr.Textbox(label="File Output")
            
            with gr.TabItem("Documentation"):
                gr.Markdown("""
                ## How to Use

                1. **Upload a PDF**: Select a PDF document containing the content you want to generate questions from.
                2. **Select a model**: Choose an instruction-tuned language model from the dropdown.
                3. **Configure settings**: 
                   - Set the number of questions to generate per text section
                   - Choose whether to include tags and difficulty levels
                   - Select your preferred output file format
                4. **Generate dataset**: Click the "Generate Q&A Dataset" button to create your dataset.

                ## About This App

                This app uses instruction-tuned language models to generate question and answer pairs from PDF documents. It:

                1. Extracts text from the uploaded PDF
                2. Splits the text into manageable chunks
                3. Generates questions, answers, tags, and difficulty levels for each chunk
                4. Combines all Q&A pairs into a comprehensive dataset

                ### Features:
                - Automatic text extraction from PDFs
                - Smart text chunking to maintain context
                - Customizable number of questions per chunk
                - Optional tagging and difficulty classification
                - Multiple output formats (JSON, CSV, Excel)

                ### Use Cases:
                - Create educational resources and quiz materials
                - Generate training data for Q&A systems
                - Build flashcard datasets for studying
                - Develop content for educational applications
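
                ### Example Output
                Each record follows the structure requested in the generation prompt. An illustrative (hand-written, not model-generated) record looks like:

                ```json
                {
                  "question": "What is the main topic of the document?",
                  "answer": "The document describes ...",
                  "tags": ["overview", "summary"],
                  "difficulty": "easy"
                }
                ```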
                """)
                
            with gr.TabItem("Status"):
                gr.Markdown("""
                ## System Status
                
                This app uses a GPU (ZeroGPU) when one is available and falls back to CPU otherwise. Model loading and generation can be slow, especially on CPU.
                If model loading fails with a GPU error, the app automatically falls back to the default `tiiuae/falcon-7b-instruct` model.
                
                ### Troubleshooting
                
                - If the app seems unresponsive after clicking "Generate", please be patient - model loading may take time.
                - If you get an error about model loading, try refreshing the page and selecting a different model.
                - Not all PDFs can be properly processed - if text extraction fails, try with a different PDF.
                """)
        
        # Event handler for generate button
        generate_btn.click(
            process_pdf_generate_qa,
            inputs=[
                pdf_file,
                model_dropdown,
                num_questions,
                include_tags,
                include_difficulty,
                output_file_format
            ],
            outputs=[parsed_data_output, formatted_data_output, raw_output, file_output],
            show_progress=True
        )
    
    return app

# Export the app for Hugging Face Spaces
app = create_interface()

# Launch the app depending on the environment
if __name__ == "__main__":
    app.launch()