FreestylerAI committed on
Commit
fbf0ed4
·
verified ·
1 Parent(s): 418d982
Files changed (3)
  1. README.md +79 -14
  2. app.py +813 -0
  3. requirements.txt +9 -0
README.md CHANGED
@@ -1,14 +1,79 @@
- ---
- title: Pdf Dataset Generator
- emoji: 📊
- colorFrom: indigo
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.29.0
- app_file: app.py
- pinned: false
- license: cc-by-sa-4.0
- short_description: Turns PDF-Files into datasets.
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: PDF Q&A Dataset Generator
+ emoji: 📚
+ colorFrom: blue
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 5.29.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ # PDF Q&A Dataset Generator
+
+ A Gradio application that generates Q&A datasets from PDF documents using instruction-tuned language models.
+
+ ## Features
+
+ - **PDF Processing**: Automatically extract and chunk text from uploaded PDFs
+ - **Q&A Generation**: Create questions, answers, tags, and difficulty levels
+ - **Multiple Models**: Choose from various instruction-tuned models
+ - **Customization**: Configure number of questions, tags, and difficulty settings
+ - **Multiple Output Formats**: Export datasets as JSON, CSV, or Excel
+
+ ## How It Works
+
+ This application:
+
+ 1. Extracts text from uploaded PDFs
+ 2. Splits the content into manageable chunks to maintain context
+ 3. Uses instruction-tuned language models to generate Q&A pairs with tags
+ 4. Combines these into a comprehensive dataset ready for use
+
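Each generated record contains a question, an answer, and optionally tags and a difficulty level, matching the JSON structure requested by the generation prompt in `app.py`. A minimal sketch of loading a saved JSON dataset, assuming an illustrative timestamped filename of the kind the app writes:

```python
import json

# Datasets are saved under synthetic_data/ with a timestamped name,
# e.g. qa_dataset_20250101_120000.json (illustrative filename)
with open("synthetic_data/qa_dataset_20250101_120000.json") as f:
    qa_pairs = json.load(f)

record = qa_pairs[0]
print(record["question"])
print(record["answer"])
print(record.get("tags"), record.get("difficulty"))
```
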
+ ## Use Cases
+
+ - Creating educational resources and assessment materials
+ - Generating training data for Q&A systems
+ - Building flashcard datasets for studying
+ - Developing content for educational applications
+ - Preparing comprehension testing materials
+
+ ## Getting Started
+
+ ### Local Installation
+
+ ```bash
+ git clone https://github.com/your-username/pdf-qa-generator.git
+ cd pdf-qa-generator
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ ### Using on Hugging Face Spaces
+
+ 1. Duplicate this Space to your account
+ 2. Upload your PDFs
+ 3. Configure your settings
+ 4. Generate your Q&A dataset
+
+ ### Enabling GPU on Hugging Face Spaces
+
+ To enable GPU acceleration on Hugging Face Spaces:
+
+ 1. Uncomment the `# import spaces` line at the top of app.py
+ 2. Uncomment the `# @spaces.GPU` decorator above the `process_pdf_generate_qa` function
+ 3. Save and redeploy your Space with GPU hardware selected
+
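Note that in the `app.py` added by this commit, `import spaces` and the `@spaces.GPU` decorator are already active. For reference, a minimal sketch of the decorator pattern, with the function signature simplified:

```python
import spaces  # Hugging Face Spaces helper; available on Spaces hardware


@spaces.GPU  # requests a GPU allocation for the duration of each call (ZeroGPU)
def process_pdf_generate_qa(pdf_file, model_name, *settings):
    """PDF extraction, chunking, and Q&A generation run inside this call."""
    ...
```
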
+ ## Models
+
+ The app includes a selection of instruction-tuned language models:
+
+ - `databricks/dolly-v2-3b` (default)
+ - `databricks/dolly-v2-7b`
+ - `EleutherAI/gpt-neo-1.3B`
+ - `EleutherAI/gpt-neo-2.7B`
+ - `tiiuae/falcon-7b-instruct`
+
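All of these are loaded through the standard `transformers` auto classes, which is how `app.py` loads them internally. A minimal standalone sketch (the precision and device settings are illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "databricks/dolly-v2-3b"  # default model used by the app
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
)
```
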
+ ## License
+
+ MIT
app.py ADDED
@@ -0,0 +1,813 @@
+ import os
+ import json
+ import pandas as pd
+ import gradio as gr
+ import spaces
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ import csv
+ import yaml
+ from typing import List, Dict, Any
+ import random
+ from pypdf import PdfReader
+ import re
+ import tempfile
+ from huggingface_hub import HfApi
+
+ # Configuration
+ DEFAULT_MODEL = "databricks/dolly-v2-3b"  # Smaller, more suitable for Spaces
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # Try to use CUDA if available
+ MAX_NEW_TOKENS = 512
+ TEMPERATURE = 0.7
+ HF_TOKEN = os.environ.get("HF_TOKEN") if os.environ.get("HF_TOKEN") else None  # Get token from environment variables
+ MAX_RAM_GB = 45  # Set maximum RAM usage to 45GB (below the 70GB limit)
+
+ # Create offload folder for model memory management
+ os.makedirs("offload_folder", exist_ok=True)
+
+ # Setup RAM monitoring
+ def get_process_memory_usage():
+     """Get the current memory usage of this process in GB"""
+     import psutil
+     process = psutil.Process(os.getpid())
+     return process.memory_info().rss / (1024 * 1024 * 1024)  # Convert to GB
+
+ class PdfExtractor:
+     """Extract text content from PDF files"""
+
+     @staticmethod
+     def extract_text_from_pdf(pdf_file):
+         """Extract text from a PDF file"""
+         try:
+             reader = PdfReader(pdf_file)
+             text = ""
+
+             for page in reader.pages:
+                 text += page.extract_text() + "\n"
+
+             return text
+         except Exception as e:
+             print(f"Error extracting text from PDF: {e}")
+             return None
+
+     @staticmethod
+     def clean_text(text):
+         """Clean and preprocess extracted text"""
+         if not text:
+             return ""
+
+         # Replace multiple newlines with single newline
+         text = re.sub(r'\n+', '\n', text)
+
+         # Replace multiple spaces with single space
+         text = re.sub(r'\s+', ' ', text)
+
+         return text.strip()
+
+     @staticmethod
+     def chunk_text(text, max_chunk_size=1000, overlap=100):
+         """Split text into chunks of specified size with overlap"""
+         if not text:
+             return []
+
+         chunks = []
+         start = 0
+         text_length = len(text)
+
+         while start < text_length:
+             end = min(start + max_chunk_size, text_length)
+
+             # If we're not at the end, try to break at a sentence or paragraph
+             if end < text_length:
+                 # Look for sentence breaks (period, question mark, exclamation mark followed by space)
+                 sentence_break = max(
+                     text.rfind('. ', start, end),
+                     text.rfind('? ', start, end),
+                     text.rfind('! ', start, end),
+                     text.rfind('\n', start, end)
+                 )
+
+                 if sentence_break > start + max_chunk_size // 2:
+                     end = sentence_break + 1
+
+             chunks.append(text[start:end].strip())
+             start = end - overlap if end < text_length else text_length  # Overlap with previous chunk; stop once the end of the text is reached
+
+         return chunks
+
+ class SyntheticDataGenerator:
+     def __init__(self, model_name=DEFAULT_MODEL):
+         self.model_name = model_name
+         self.model = None
+         self.tokenizer = None
+         self.fallback_models = [
+             "databricks/dolly-v2-3b",  # Smallest, most reliable model as primary fallback
+             "EleutherAI/gpt-neo-1.3B",  # Second fallback option
+         ]
+         # Don't try to load the model in init - we'll load it when needed
+         # This prevents initialization errors from blocking the app startup
+
+     def load_model(self):
+         """Load the specified model or fall back to a smaller model if loading fails"""
+         # Clear CUDA cache if using GPU to prevent memory fragmentation
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+             # Try initializing CUDA explicitly to catch early errors
+             try:
+                 torch.tensor([1.0], device="cuda")
+             except Exception as e:
+                 print(f"CUDA initialization error: {e}")
+
+         models_to_try = [self.model_name]
+
+         # Add fallback models only if the requested model isn't already in the fallback list
+         if self.model_name not in self.fallback_models:
+             models_to_try.extend(self.fallback_models)
+
+         for model_name in models_to_try:
+             try:
+                 print(f"Loading model {model_name} on {DEVICE}...")
+
+                 # Add token for authentication if available
+                 tokenizer_kwargs = {}
+                 model_kwargs = {
+                     "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
+                     "device_map": "auto" if torch.cuda.is_available() else None,
+                     "low_cpu_mem_usage": True,  # Added to reduce memory usage on CPU
+                     "offload_folder": "offload_folder"  # Add offload folder for large models
+                 }
+
+                 # Handle potential CUDA out-of-memory errors with smaller settings
+                 if torch.cuda.is_available():
+                     try:
+                         # Check available GPU memory
+                         free_memory, total_memory = torch.cuda.mem_get_info()
+                         free_memory_gb = free_memory / (1024**3)
+                         total_memory_gb = total_memory / (1024**3)
+                         print(f"GPU memory: {free_memory_gb:.2f}GB free out of {total_memory_gb:.2f}GB total")
+
+                         # If we're running low on memory (this is common in Spaces), use more aggressive memory saving
+                         model_kwargs["max_memory"] = {0: f"{max(free_memory_gb*0.8, 0.5)}GB", "cpu": "8GB"}
+
+                         # For smaller GPUs or when memory is constrained, use more aggressive offloading
+                         if free_memory_gb < 4.0:  # Less than 4GB free
+                             print("Low GPU memory detected. Using CPU offloading...")
+                             # More conservative memory map to prevent ZeroGPU errors
+                             model_kwargs["device_map"] = "auto"
+                             model_kwargs["offload_state_dict"] = True  # More aggressive offloading
+                     except Exception as memory_check_error:
+                         print(f"Error checking GPU memory: {memory_check_error}")
+                         # Continue with default settings but with safeguards
+                         model_kwargs["device_map"] = "auto"  # Let the library decide the best mapping
+
+                 # Add token for authentication if available and model is gated
+                 if HF_TOKEN:
+                     tokenizer_kwargs["token"] = HF_TOKEN
+                     model_kwargs["token"] = HF_TOKEN
+                     print("Using Hugging Face token for authentication")
+
+                 # Load tokenizer with safeguards against ZeroGPU issues
+                 try:
+                     self.tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)
+                 except Exception as tokenizer_error:
+                     print(f"Error loading tokenizer: {tokenizer_error}")
+                     # Try loading with additional safety settings
+                     tokenizer_kwargs["local_files_only"] = False
+                     tokenizer_kwargs["revision"] = "main"
+                     self.tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)
+
+                 # Load the model with ZeroGPU error prevention
+                 try:
+                     self.model = AutoModelForCausalLM.from_pretrained(
+                         model_name,
+                         **model_kwargs
+                     )
+                 except RuntimeError as e:
+                     if "CUDA" in str(e) or "GPU" in str(e) or "out of memory" in str(e):
+                         print(f"CUDA error loading model: {e}")
+                         # Fall back to CPU if GPU fails
+                         print("Falling back to CPU for model loading")
+                         model_kwargs["device_map"] = {"": "cpu"}
+                         model_kwargs["torch_dtype"] = torch.float32
+                         self.model = AutoModelForCausalLM.from_pretrained(
+                             model_name,
+                             **model_kwargs
+                         )
+                     else:
+                         raise
+
+                 # Ensure model is on the right device if not using device_map="auto"
+                 if not torch.cuda.is_available():
+                     self.model = self.model.to(DEVICE)
+
+                 # If we loaded a fallback model instead of the requested one, update the model_name
+                 if model_name != self.model_name:
+                     print(f"Successfully loaded fallback model {model_name} instead of {self.model_name}")
+                     self.model_name = model_name
+                 else:
+                     print(f"Model loaded successfully on {DEVICE}")
+
+                 return True
+             except Exception as e:
+                 print(f"Error loading model {model_name}: {e}")
+                 self.model = None
+                 self.tokenizer = None
+                 # Continue to the next model in the list
+
+         # If we get here, all models failed
+         print("All models failed to load")
+         return False
+
+     def generate_qa_prompt(self, context, num_questions=3, include_tags=True, difficulty_levels=True):
+         """Generate a prompt for creating Q&A pairs from context."""
+         tag_instruction = ""
+         if include_tags:
+             tag_instruction = "Add 1-3 tags for each question that categorize the topic or subject matter."
+
+         difficulty_instruction = ""
+         if difficulty_levels:
+             difficulty_instruction = "For each question, assign a difficulty level (easy, medium, or hard)."
+
+         prompt = f"""Task: Based on the following text, generate {num_questions} question and answer pairs that would be useful for comprehension testing or knowledge assessment.
+
+ CONTEXT:
+ {context}
+
+ For each question:
+ 1. Write a clear, specific question about the information in the text
+ 2. Provide the correct answer to the question, citing relevant details from the text
+ 3. {tag_instruction}
+ 4. {difficulty_instruction}
+
+ Format each Q&A pair as a JSON object with the following structure:
+ {{
+     "question": "The question text",
+     "answer": "The answer text",
+     "tags": ["tag1", "tag2"],
+     "difficulty": "easy/medium/hard"
+ }}
+
+ Return all Q&A pairs in a JSON array.
+ """
+         return prompt
+
+     def generate_data(self, prompt, num_samples=1):
+         """Generate synthetic data using the loaded model."""
+         if not self.model or not self.tokenizer:
+             return ["Error: Model not loaded properly. Please try again with a different model."]
+
+         outputs = []
+         for sample_idx in range(num_samples):
+             try:
+                 # Clear CUDA cache before generating to free up memory
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+
+                 # ZeroGPU errors often occur in generate() calls
+                 # To mitigate this, try multiple approaches in sequence
+                 inputs = self.tokenizer(prompt, return_tensors="pt").to(DEVICE)
+
+                 try:
+                     # First try: Standard generation with conservative settings
+                     with torch.no_grad():
+                         output = self.model.generate(
+                             **inputs,
+                             max_new_tokens=MAX_NEW_TOKENS,
+                             temperature=TEMPERATURE,
+                             do_sample=True,
+                             pad_token_id=self.tokenizer.eos_token_id,
+                             num_beams=1,  # Use greedy decoding instead of beam search
+                             early_stopping=True,
+                             no_repeat_ngram_size=3  # Prevent repetition
+                         )
+
+                     decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
+                 except Exception as e:  # Exception already covers RuntimeError
+                     if "CUDA" in str(e) or "GPU" in str(e) or "ZeroGPU" in str(e):
+                         print(f"GPU error during generation: {e}")
+                         print("Falling back to CPU generation...")
+
+                         # Move everything to CPU
+                         inputs = {k: v.to('cpu') for k, v in inputs.items()}
+
+                         # Create CPU copy of the model if we were using GPU
+                         if torch.cuda.is_available():
+                             # Temporarily move model to CPU for this generation
+                             model_cpu = self.model.to('cpu')
+
+                             with torch.no_grad():
+                                 output = model_cpu.generate(
+                                     **inputs,
+                                     max_new_tokens=MAX_NEW_TOKENS,
+                                     temperature=TEMPERATURE,
+                                     do_sample=True,
+                                     pad_token_id=self.tokenizer.eos_token_id,
+                                     num_return_sequences=1,
+                                     max_length=MAX_NEW_TOKENS + inputs['input_ids'].shape[1]
+                                 )
+                             decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
+
+                             # Move model back to CUDA for future calls
+                             self.model = self.model.to(DEVICE)
+                         else:
+                             # Already on CPU, try with reduced parameters
+                             with torch.no_grad():
+                                 output = self.model.generate(
+                                     **inputs,
+                                     max_new_tokens=min(256, MAX_NEW_TOKENS),  # Reduce token count
+                                     temperature=0.5,  # Lower temperature
+                                     do_sample=False,  # No sampling
+                                     num_return_sequences=1,
+                                     pad_token_id=self.tokenizer.eos_token_id
+                                 )
+                             decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
+                     else:
+                         # Re-raise non-CUDA errors
+                         raise
+
+                 # Extract only the generated part (remove prompt); use key access because inputs may be a plain dict after the CPU fallback
+                 prompt_text = self.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
+                 generated_text = decoded_output[len(prompt_text):].strip()
+                 outputs.append(generated_text)
+
+                 # Clear CUDA cache between samples
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+
+             except Exception as e:
+                 error_msg = f"Error generating sample {sample_idx+1}: {str(e)}"
+                 print(error_msg)
+                 outputs.append(f"Error: {error_msg}")
+
+         return outputs
+
+     def parse_json_data(self, generated_text):
+         """Extract and parse JSON from generated text."""
+         try:
+             # Find JSON-like content (between [ and ])
+             start_idx = generated_text.find('[')
+             end_idx = generated_text.rfind(']') + 1
+
+             if start_idx >= 0 and end_idx > start_idx:
+                 json_str = generated_text[start_idx:end_idx]
+                 return json.loads(json_str)
+
+             # Try to find single object format
+             start_idx = generated_text.find('{')
+             end_idx = generated_text.rfind('}') + 1
+
+             if start_idx >= 0 and end_idx > start_idx:
+                 json_str = generated_text[start_idx:end_idx]
+                 return json.loads(json_str)
+
+             print(f"Could not find JSON content in: {generated_text}")
+             return None
+         except json.JSONDecodeError as e:
+             print(f"JSON parse error: {e}")
+             print(f"Problematic text: {generated_text}")
+
+             # Try to find and fix common JSON formatting errors
+             try:
+                 # Replace single quotes with double quotes
+                 json_str = generated_text[start_idx:end_idx].replace("'", "\"")
+                 return json.loads(json_str)
+             except:
+                 pass
+
+             # If still failing, try to extract individual JSON objects
+             try:
+                 pattern = r'\{[^{}]*\}'
+                 matches = re.findall(pattern, generated_text)
+                 if matches:
+                     results = []
+                     for match in matches:
+                         try:
+                             # Replace single quotes with double quotes
+                             fixed_match = match.replace("'", "\"")
+                             obj = json.loads(fixed_match)
+                             results.append(obj)
+                         except:
+                             continue
+                     if results:
+                         return results
+             except:
+                 pass
+
+             return None
+
+     def generate_qa_from_pdf_chunk(self, chunk, num_questions=3, include_tags=True, difficulty_levels=True):
+         """Generate Q&A pairs from a PDF text chunk."""
+         if not self.model or not self.tokenizer:
+             return [], "Error: Model not loaded properly. Please try again with a different model."
+
+         if not chunk or len(chunk.strip()) < 100:  # Skip very small chunks
+             return [], "Chunk too small to generate meaningful Q&A pairs."
+
+         prompt = self.generate_qa_prompt(chunk, num_questions, include_tags, difficulty_levels)
+         raw_outputs = self.generate_data(prompt, num_samples=1)
+         raw_output = raw_outputs[0]
+
+         parsed_data = self.parse_json_data(raw_output)
+
+         # Ensure parsed data is a list
+         if parsed_data and isinstance(parsed_data, dict):
+             parsed_data = [parsed_data]
+
+         # Return both the parsed data and raw output for debugging
+         return parsed_data, raw_output
+
+ def format_data_preview(data):
+     """Format the data for preview in the UI."""
+     if isinstance(data, list):
+         if len(data) > 0 and isinstance(data[0], dict):
+             # Convert list of dicts to DataFrame for better display
+             return pd.DataFrame(data).to_string()
+         else:
+             return json.dumps(data, indent=2)
+     elif isinstance(data, dict):
+         return json.dumps(data, indent=2)
+     else:
+         return str(data)
+
+ def save_data(data, format, filename_prefix):
+     """Save data to a file in the specified format."""
+     os.makedirs("synthetic_data", exist_ok=True)
+     timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"synthetic_data/{filename_prefix}_{timestamp}"
+
+     if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
+         df = pd.DataFrame(data)
+
+         if format.lower() == "csv":
+             full_filename = f"{filename}.csv"
+             df.to_csv(full_filename, index=False)
+         elif format.lower() == "json":
+             full_filename = f"{filename}.json"
+             with open(full_filename, "w") as f:
+                 json.dump(data, f, indent=2)
+         elif format.lower() == "excel":
+             full_filename = f"{filename}.xlsx"
+             df.to_excel(full_filename, index=False)
+         else:
+             full_filename = f"{filename}.txt"
+             with open(full_filename, "w") as f:
+                 f.write(str(data))
+     else:
+         full_filename = f"{filename}.{format.lower()}"
+         with open(full_filename, "w") as f:
+             if format.lower() == "json":
+                 json.dump(data, f, indent=2)
+             else:
+                 f.write(str(data))
+
+     return full_filename
+
+ def load_models():
+     """Return a list of available models."""
+     return [
+         "databricks/dolly-v2-3b",
+         "databricks/dolly-v2-7b",
+         "EleutherAI/gpt-neo-1.3B",
+         "EleutherAI/gpt-neo-2.7B",
+         "tiiuae/falcon-7b-instruct"
+     ]
+
+ @spaces.GPU
+ def process_pdf_generate_qa(pdf_file, model_name, num_questions_per_chunk, include_tags, include_difficulty, output_file_format, progress=None):
+     """Process a PDF file and generate Q&A pairs from its content."""
+     if pdf_file is None:
+         return None, "Error: No PDF file uploaded", "", "No file provided"
+
+     try:
+         # Check RAM usage at start
+         current_ram_usage = get_process_memory_usage()
+         print(f"Starting RAM usage: {current_ram_usage:.2f}GB")
+
+         # Clear CUDA cache before starting
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         # Initialize extractor and generator
+         extractor = PdfExtractor()
+         generator = SyntheticDataGenerator(model_name)
+
+         # Wrap model loading in try-except to handle errors
+         try:
+             load_success = generator.load_model()
+             if not load_success:
+                 return None, "Error: Failed to load the model. Please try again with a different model.", "", "Model loading failed"
+         except Exception as e:
+             if "ZeroGPU" in str(e) or "GPU task aborted" in str(e) or "CUDA" in str(e):
+                 print(f"GPU error during model loading: {e}. Trying with a smaller model...")
+                 # If we get a ZeroGPU error, immediately try the smallest model
+                 generator.model_name = "EleutherAI/gpt-neo-1.3B"  # Use smallest model as emergency fallback
+                 load_success = generator.load_model()
+                 if not load_success:
+                     return None, "Error: Failed to load any model even after fallback. Please try again later.", "", "Model loading failed"
+             else:
+                 # Re-raise other errors
+                 raise
+
+         # Check RAM usage after model loading
+         ram_after_model = get_process_memory_usage()
+         print(f"RAM usage after model loading: {ram_after_model:.2f}GB")
+
+         # Save PDF temporarily if it's a file object
+         if hasattr(pdf_file, 'name'):
+             # It's already a file path
+             pdf_path = pdf_file.name
+         else:
+             # Create a temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
+                 tmp.write(pdf_file)
+                 pdf_path = tmp.name
+
+         # Extract text from PDF
+         pdf_text = extractor.extract_text_from_pdf(pdf_path)
+
+         if not pdf_text:
+             return None, "Failed to extract text from PDF", "", "No data generated"
+
+         # Clean and chunk the text - reduce chunk size to use less memory
+         cleaned_text = extractor.clean_text(pdf_text)
+         chunks = extractor.chunk_text(cleaned_text, max_chunk_size=400, overlap=30)
+
+         # Check RAM after PDF processing
+         ram_after_pdf = get_process_memory_usage()
+         print(f"RAM usage after PDF processing: {ram_after_pdf:.2f}GB, found {len(chunks)} chunks")
+
+         # If we're approaching the RAM limit already, reduce batch size
+         batch_size = 3  # Default
+         if ram_after_pdf > MAX_RAM_GB * 0.7:  # If already using 70% of our limit
+             batch_size = 1  # Process one chunk at a time
+             print(f"High RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 1")
+         elif ram_after_pdf > MAX_RAM_GB * 0.5:  # If using 50% of our limit
+             batch_size = 2  # Process two chunks at a time
+             print(f"Moderate RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 2")
+
+         # Generate Q&A pairs for each chunk
+         all_qa_pairs = []
+         all_raw_outputs = []
+
+         total_chunks = len(chunks)
+
+         # Process chunks in smaller batches to avoid memory buildup
+         for i in range(0, total_chunks, batch_size):
+             # Get the current batch of chunks
+             batch_chunks = chunks[i:min(i+batch_size, total_chunks)]
+
+             # Process each chunk in the batch
+             for j, chunk in enumerate(batch_chunks):
+                 chunk_index = i + j
+
+                 if progress is not None:
+                     progress(chunk_index / total_chunks, f"Processing chunk {chunk_index+1}/{total_chunks}")
+
+                 # Check if we're approaching RAM limit
+                 current_ram = get_process_memory_usage()
+                 if current_ram > MAX_RAM_GB * 0.9:  # Over 90% of our limit
+                     print(f"WARNING: High RAM usage detected: {current_ram:.2f}GB - force releasing memory")
+                     import gc
+                     gc.collect()  # Force garbage collection
+                     if torch.cuda.is_available():
+                         torch.cuda.empty_cache()
+
+                     # If still too high after garbage collection, abort batch processing
+                     current_ram = get_process_memory_usage()
+                     if current_ram > MAX_RAM_GB * 0.95:  # Still dangerously high
+                         print(f"CRITICAL: RAM usage too high ({current_ram:.2f}GB), stopping processing")
+                         break
+
+                 # Clear CUDA cache between chunks
+                 if torch.cuda.is_available():
+                     torch.cuda.empty_cache()
+
+                 try:
+                     qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(
+                         chunk,
+                         num_questions=num_questions_per_chunk,
+                         include_tags=include_tags,
+                         difficulty_levels=include_difficulty
+                     )
+                 except Exception as e:
+                     error_type = str(e)
+                     if "CUDA" in error_type or "GPU" in error_type or "ZeroGPU" in error_type:
+                         print(f"GPU error during generation for chunk {chunk_index+1}: {e}")
+                         # Fall back to CPU for this specific generation
+                         raw_output = f"Error in chunk {chunk_index+1}: {str(e)}. Skipping..."
+                         qa_pairs = None
+                     elif "memory" in error_type.lower() or "ram" in error_type.lower():
+                         print(f"Memory error processing chunk {chunk_index+1}: {e}")
+                         # Force garbage collection and skip chunk
+                         import gc
+                         gc.collect()
+                         if torch.cuda.is_available():
+                             torch.cuda.empty_cache()
+                         raw_output = f"Memory error in chunk {chunk_index+1}: {str(e)}. Skipping..."
+                         qa_pairs = None
+                     else:
+                         # For other errors, just log and continue
+                         print(f"Error processing chunk {chunk_index+1}: {e}")
+                         raw_output = f"Error in chunk {chunk_index+1}: {str(e)}"
+                         qa_pairs = None
+
+                 if qa_pairs:
+                     all_qa_pairs.extend(qa_pairs)
+                 all_raw_outputs.append(raw_output)
+
+                 # Check RAM usage after processing this chunk
+                 current_ram = get_process_memory_usage()
+                 print(f"RAM after chunk {chunk_index+1}: {current_ram:.2f}GB")
+
+             # Do a thorough cleanup after each batch
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+
+             # Force garbage collection between batches
+             import gc
+             gc.collect()
+
+             # Check if we need to abort due to memory constraints
+             current_ram = get_process_memory_usage()
+             if current_ram > MAX_RAM_GB:
+                 print(f"WARNING: Exceeding RAM limit ({current_ram:.2f}GB). Stopping further processing.")
+                 if progress is not None:
+                     progress(1.0, f"Stopped early due to high memory usage ({current_ram:.2f}GB)")
+                 break
+
+         if progress is not None:
+             progress(1.0, "Finished processing")
+
+         # Final cache clear and garbage collection
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         import gc
+         gc.collect()
+
+         if not all_qa_pairs:
+             return None, "Failed to generate Q&A pairs", "\n\n".join(all_raw_outputs), "No data generated"
+
+         # Save data to file
+         filename = save_data(
+             all_qa_pairs,
+             output_file_format,
+             "qa_dataset"
+         )
+
+         # Format for display
+         formatted_data = format_data_preview(all_qa_pairs)
+
+         # Final memory report
+         final_ram = get_process_memory_usage()
+         print(f"Final RAM usage: {final_ram:.2f}GB")
+
+         return all_qa_pairs, formatted_data, "\n\n".join(all_raw_outputs), f"Data saved to {filename}"
+     except Exception as e:
+         error_msg = f"Error processing PDF: {str(e)}"
+         print(error_msg)
+         import traceback
+         print(traceback.format_exc())
+         return None, error_msg, "", "Processing failed"
+
+ # Set up the Gradio interface
+ def create_interface():
+     with gr.Blocks(title="PDF Q&A Dataset Generator") as app:
+         gr.Markdown("# 📚 PDF Q&A Dataset Generator")
+         gr.Markdown("""
+ Generate question & answer datasets from PDF documents using instruction-tuned language models.
+ Perfect for creating educational resources, quiz materials, or training data for Q&A systems.
+ """)
+
+         with gr.Tabs() as tabs:
+             with gr.TabItem("Generate Q&A Dataset"):
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         pdf_file = gr.File(
+                             label="Upload PDF",
+                             file_types=[".pdf"],
+                             type="binary"
+                         )
+
+                         model_dropdown = gr.Dropdown(
+                             choices=load_models(),
+                             value=DEFAULT_MODEL,
+                             label="Model"
+                         )
+
+                         num_questions = gr.Slider(
+                             minimum=1,
+                             maximum=5,
+                             value=3,
+                             step=1,
+                             label="Questions per Section"
+                         )
+
+                         include_tags = gr.Checkbox(
+                             value=True,
+                             label="Include Tags"
+                         )
+
+                         include_difficulty = gr.Checkbox(
+                             value=True,
+                             label="Include Difficulty Levels"
+                         )
+
+                         output_file_format = gr.Radio(
+                             choices=["json", "csv", "excel"],
+                             value="json",
+                             label="Save File Format"
+                         )
+
+                         generate_btn = gr.Button("Generate Q&A Dataset", variant="primary")
+
+                         progress_bar = gr.Progress()
+
+                     with gr.Column(scale=2):
+                         with gr.Tab("Parsed Data"):
+                             parsed_data_output = gr.JSON(label="Generated Q&A Pairs")
+                             formatted_data_output = gr.Textbox(
+                                 label="Formatted Preview",
+                                 lines=15
+                             )
+
+                         with gr.Tab("Raw Output"):
+                             raw_output = gr.Textbox(
+                                 label="Raw Model Output",
+                                 lines=15
+                             )
+
+                         file_output = gr.Textbox(label="File Output")
+
+             with gr.TabItem("Documentation"):
+                 gr.Markdown("""
+ ## How to Use
+
+ 1. **Upload a PDF**: Select a PDF document containing the content you want to generate questions from.
+ 2. **Select a model**: Choose an instruction-tuned language model from the dropdown.
+ 3. **Configure settings**:
+    - Set the number of questions to generate per text section
+    - Choose whether to include tags and difficulty levels
+    - Select your preferred output file format
+ 4. **Generate dataset**: Click the "Generate Q&A Dataset" button to create your dataset.
+
+ ## About This App
+
+ This app uses instruction-tuned language models to generate question and answer pairs from PDF documents. It:
+
+ 1. Extracts text from the uploaded PDF
+ 2. Splits the text into manageable chunks
+ 3. Generates questions, answers, tags, and difficulty levels for each chunk
+ 4. Combines all Q&A pairs into a comprehensive dataset
+
+ ### Features:
+ - Automatic text extraction from PDFs
+ - Smart text chunking to maintain context
+ - Customizable number of questions per chunk
+ - Optional tagging and difficulty classification
+ - Multiple output formats (JSON, CSV, Excel)
+
+ ### Use Cases:
+ - Create educational resources and quiz materials
+ - Generate training data for Q&A systems
+ - Build flashcard datasets for studying
+ - Develop content for educational applications
+ """)
+
+             with gr.TabItem("Status"):
+                 gr.Markdown("""
+ ## System Status
+
+ This app runs in CPU mode. Some larger models might be slower to load and generate content.
+ If you encounter any issues with a specific model, try switching to a smaller model like `databricks/dolly-v2-3b`.
+
+ ### Troubleshooting
+
+ - If the app seems unresponsive after clicking "Generate", please be patient - model loading may take time.
+ - If you get an error about model loading, try refreshing the page and selecting a different model.
+ - Not all PDFs can be properly processed - if text extraction fails, try with a different PDF.
+ """)
+
+         # Event handler for generate button
+         generate_btn.click(
+             process_pdf_generate_qa,
+             inputs=[
+                 pdf_file,
+                 model_dropdown,
+                 num_questions,
+                 include_tags,
+                 include_difficulty,
+                 output_file_format
+             ],
+             outputs=[parsed_data_output, formatted_data_output, raw_output, file_output],
+             show_progress=True
+         )
+
+     return app
+
+ # Export the app for Hugging Face Spaces
+ app = create_interface()
+
+ # Launch the app depending on the environment
+ if __name__ == "__main__":
+     app.launch()
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==5.29.0
+ torch>=2.0.0
+ transformers>=4.30.0
+ pandas>=1.5.0
+ PyYAML>=6.0
+ openpyxl>=3.1.0
+ pypdf>=3.9.0
+ huggingface-hub>=0.20.0
+ accelerate>=0.25.0