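"""PDF Q&A Dataset Generator.

A Gradio app (designed for Hugging Face Spaces) that extracts text from an
uploaded PDF, splits it into overlapping chunks, and uses an instruction-tuned
causal language model (Falcon-7B-Instruct by default) to generate question/answer
pairs with optional tags and difficulty levels. Datasets can be saved as JSON,
CSV, or Excel.
"""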
import os
import json
import pandas as pd
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import csv
import yaml
from typing import List, Dict, Any
import random
from pypdf import PdfReader
import re
import tempfile
from huggingface_hub import HfApi
# Configuration
DEFAULT_MODEL = "tiiuae/falcon-7b-instruct" # Use Falcon-7B as the default model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Try to use CUDA if available
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token from the environment, or None if unset
MAX_RAM_GB = 45 # Set maximum RAM usage to 45GB (below the 70GB limit)
# Create offload folder for model memory management
os.makedirs("offload_folder", exist_ok=True)
# Setup RAM monitoring
def get_process_memory_usage():
"""Get the current memory usage of this process in GB"""
import psutil
process = psutil.Process(os.getpid())
return process.memory_info().rss / (1024 * 1024 * 1024) # Convert to GB
class PdfExtractor:
"""Extract text content from PDF files"""
@staticmethod
def extract_text_from_pdf(pdf_file):
"""Extract text from a PDF file"""
try:
reader = PdfReader(pdf_file)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text
except Exception as e:
print(f"Error extracting text from PDF: {e}")
return None
@staticmethod
def clean_text(text):
"""Clean and preprocess extracted text"""
if not text:
return ""
# Replace multiple newlines with single newline
text = re.sub(r'\n+', '\n', text)
# Replace multiple spaces with single space
text = re.sub(r'\s+', ' ', text)
return text.strip()
@staticmethod
def chunk_text(text, max_chunk_size=1000, overlap=100):
"""Split text into chunks of specified size with overlap"""
if not text:
return []
chunks = []
start = 0
text_length = len(text)
while start < text_length:
end = min(start + max_chunk_size, text_length)
# If we're not at the end, try to break at a sentence or paragraph
if end < text_length:
# Look for sentence breaks (period, question mark, exclamation mark followed by space)
sentence_break = max(
text.rfind('. ', start, end),
text.rfind('? ', start, end),
text.rfind('! ', start, end),
text.rfind('\n', start, end)
)
if sentence_break > start + max_chunk_size // 2:
end = sentence_break + 1
            chunks.append(text[start:end].strip())
            if end >= text_length:
                break  # Done; without this check the final chunk would be re-appended indefinitely
            start = end - overlap  # Step back so adjacent chunks share some context
        return chunks
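# Example usage (sketch; "document.pdf" is a hypothetical path):
#   raw_text = PdfExtractor.extract_text_from_pdf("document.pdf")
#   chunks = PdfExtractor.chunk_text(PdfExtractor.clean_text(raw_text), max_chunk_size=1000, overlap=100)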
class SyntheticDataGenerator:
def __init__(self, model_name=DEFAULT_MODEL):
self.model_name = model_name
self.model = None
self.tokenizer = None
self.load_model() # Load the model directly during initialization
def load_model(self):
"""Load the specified model."""
# Clear CUDA cache if using GPU to prevent memory fragmentation
if torch.cuda.is_available():
torch.cuda.empty_cache()
try:
print(f"Loading model {self.model_name} on {DEVICE}...")
# Add token for authentication if available
tokenizer_kwargs = {}
model_kwargs = {
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
"device_map": "auto" if torch.cuda.is_available() else None,
"low_cpu_mem_usage": True, # Added to reduce memory usage on CPU
"offload_folder": "offload_folder" # Add offload folder for large models
}
if HF_TOKEN:
tokenizer_kwargs["token"] = HF_TOKEN
model_kwargs["token"] = HF_TOKEN
print("Using Hugging Face token for authentication")
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, **tokenizer_kwargs)
# Load the model
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
**model_kwargs
)
# Ensure model is on the right device if not using device_map="auto"
if not torch.cuda.is_available():
self.model = self.model.to(DEVICE)
print(f"Model {self.model_name} loaded successfully on {DEVICE}")
except Exception as e:
print(f"Error loading model {self.model_name}: {e}")
self.model = None
self.tokenizer = None
raise
def generate_qa_prompt(self, context, num_questions=3, include_tags=True, difficulty_levels=True):
"""Generate a prompt for creating Q&A pairs from context."""
tag_instruction = ""
if include_tags:
tag_instruction = "Add 1-3 tags for each question that categorize the topic or subject matter."
difficulty_instruction = ""
if difficulty_levels:
difficulty_instruction = "For each question, assign a difficulty level (easy, medium, or hard)."
prompt = f"""Task: Based on the following text, generate {num_questions} question and answer pairs that would be useful for comprehension testing or knowledge assessment.
CONTEXT:
{context}
For each question:
1. Write a clear, specific question about the information in the text
2. Provide the correct answer to the question, citing relevant details from the text
3. {tag_instruction}
4. {difficulty_instruction}
Format each Q&A pair as a JSON object with the following structure:
{{
"question": "The question text",
"answer": "The answer text",
"tags": ["tag1", "tag2"],
"difficulty": "easy/medium/hard"
}}
Return all Q&A pairs in a JSON array.
"""
return prompt
def generate_data(self, prompt, num_samples=1):
"""Generate synthetic data using the loaded model."""
if not self.model or not self.tokenizer:
return ["Error: Model not loaded properly. Please try again with a different model."]
outputs = []
for sample_idx in range(num_samples):
try:
# Clear CUDA cache before generating to free up memory
if torch.cuda.is_available():
torch.cuda.empty_cache()
# ZeroGPU errors often occur in generate() calls
# To mitigate this, try multiple approaches in sequence
inputs = self.tokenizer(prompt, return_tensors="pt").to(DEVICE)
try:
# First try: Standard generation with conservative settings
with torch.no_grad():
output = self.model.generate(
**inputs,
max_new_tokens=MAX_NEW_TOKENS,
temperature=TEMPERATURE,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
                            num_beams=1,  # Sampling without beam search
                            no_repeat_ngram_size=3  # Prevent short repetition loops
)
decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
                except Exception as e:  # RuntimeError is a subclass of Exception, so one clause suffices
if "CUDA" in str(e) or "GPU" in str(e) or "ZeroGPU" in str(e):
print(f"GPU error during generation: {e}")
print("Falling back to CPU generation...")
# Move everything to CPU
inputs = {k: v.to('cpu') for k, v in inputs.items()}
# Create CPU copy of the model if we were using GPU
if torch.cuda.is_available():
# Temporarily move model to CPU for this generation
model_cpu = self.model.to('cpu')
with torch.no_grad():
output = model_cpu.generate(
**inputs,
max_new_tokens=MAX_NEW_TOKENS,
temperature=TEMPERATURE,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
                                    num_return_sequences=1
)
decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
# Move model back to CUDA for future calls
self.model = self.model.to(DEVICE)
else:
# Already on CPU, try with reduced parameters
with torch.no_grad():
output = self.model.generate(
**inputs,
max_new_tokens=min(256, MAX_NEW_TOKENS), # Reduce token count
temperature=0.5, # Lower temperature
do_sample=False, # No sampling
num_return_sequences=1,
pad_token_id=self.tokenizer.eos_token_id
)
decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
else:
# Re-raise non-CUDA errors
raise
# Extract only the generated part (remove prompt)
                prompt_text = self.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)  # key access works for both BatchEncoding and the plain dict used in the CPU fallback
generated_text = decoded_output[len(prompt_text):].strip()
outputs.append(generated_text)
# Clear CUDA cache between samples
if torch.cuda.is_available():
torch.cuda.empty_cache()
except Exception as e:
error_msg = f"Error generating sample {sample_idx+1}: {str(e)}"
print(error_msg)
outputs.append(f"Error: {error_msg}")
return outputs
def parse_json_data(self, generated_text):
"""Extract and parse JSON from generated text."""
try:
# Find JSON-like content (between [ and ])
start_idx = generated_text.find('[')
end_idx = generated_text.rfind(']') + 1
if start_idx >= 0 and end_idx > start_idx:
json_str = generated_text[start_idx:end_idx]
return json.loads(json_str)
# Try to find single object format
start_idx = generated_text.find('{')
end_idx = generated_text.rfind('}') + 1
if start_idx >= 0 and end_idx > start_idx:
json_str = generated_text[start_idx:end_idx]
return json.loads(json_str)
print(f"Could not find JSON content in: {generated_text}")
return None
except json.JSONDecodeError as e:
print(f"JSON parse error: {e}")
print(f"Problematic text: {generated_text}")
# Try to find and fix common JSON formatting errors
try:
# Replace single quotes with double quotes
json_str = generated_text[start_idx:end_idx].replace("'", "\"")
return json.loads(json_str)
except:
pass
# If still failing, try to extract individual JSON objects
try:
pattern = r'\{[^{}]*\}'
matches = re.findall(pattern, generated_text)
if matches:
results = []
for match in matches:
try:
# Replace single quotes with double quotes
fixed_match = match.replace("'", "\"")
obj = json.loads(fixed_match)
results.append(obj)
except:
continue
if results:
return results
except:
pass
return None
def generate_qa_from_pdf_chunk(self, chunk, num_questions=3, include_tags=True, difficulty_levels=True):
"""Generate Q&A pairs from a PDF text chunk."""
if not self.model or not self.tokenizer:
return [], "Error: Model not loaded properly. Please try again with a different model."
if not chunk or len(chunk.strip()) < 100: # Skip very small chunks
return [], "Chunk too small to generate meaningful Q&A pairs."
prompt = self.generate_qa_prompt(chunk, num_questions, include_tags, difficulty_levels)
raw_outputs = self.generate_data(prompt, num_samples=1)
raw_output = raw_outputs[0]
parsed_data = self.parse_json_data(raw_output)
# Ensure parsed data is a list
if parsed_data and isinstance(parsed_data, dict):
parsed_data = [parsed_data]
# Return both the parsed data and raw output for debugging
return parsed_data, raw_output
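# Example usage (sketch; assumes the chosen model fits the available hardware):
#   generator = SyntheticDataGenerator()  # loads DEFAULT_MODEL in __init__
#   qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(chunk, num_questions=3)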
def format_data_preview(data):
"""Format the data for preview in the UI."""
if isinstance(data, list):
if len(data) > 0 and isinstance(data[0], dict):
# Convert list of dicts to DataFrame for better display
return pd.DataFrame(data).to_string()
else:
return json.dumps(data, indent=2)
elif isinstance(data, dict):
return json.dumps(data, indent=2)
else:
return str(data)
def save_data(data, format, filename_prefix):
"""Save data to a file in the specified format."""
os.makedirs("synthetic_data", exist_ok=True)
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
filename = f"synthetic_data/{filename_prefix}_{timestamp}"
if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
df = pd.DataFrame(data)
if format.lower() == "csv":
full_filename = f"{filename}.csv"
df.to_csv(full_filename, index=False)
elif format.lower() == "json":
full_filename = f"{filename}.json"
with open(full_filename, "w") as f:
json.dump(data, f, indent=2)
elif format.lower() == "excel":
full_filename = f"{filename}.xlsx"
df.to_excel(full_filename, index=False)
else:
full_filename = f"{filename}.txt"
with open(full_filename, "w") as f:
f.write(str(data))
else:
full_filename = f"{filename}.{format.lower()}"
with open(full_filename, "w") as f:
if format.lower() == "json":
json.dump(data, f, indent=2)
else:
f.write(str(data))
return full_filename
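# Example (sketch): save_data(qa_pairs, "json", "qa_dataset") writes
# synthetic_data/qa_dataset_<timestamp>.json and returns that path.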
def load_models():
"""Return a list of available models."""
return [
"tiiuae/falcon-7b-instruct"
]
@spaces.GPU
def process_pdf_generate_qa(pdf_file, model_name, num_questions_per_chunk, include_tags, include_difficulty, output_file_format, progress=gr.Progress()):
"""Process a PDF file and generate Q&A pairs from its content."""
if pdf_file is None:
return None, "Error: No PDF file uploaded", "", "No file provided"
try:
# Check RAM usage at start
current_ram_usage = get_process_memory_usage()
print(f"Starting RAM usage: {current_ram_usage:.2f}GB")
# Clear CUDA cache before starting
if torch.cuda.is_available():
torch.cuda.empty_cache()
        # Initialize the extractor; the generator loads its model inside __init__
        extractor = PdfExtractor()
        try:
            generator = SyntheticDataGenerator(model_name)
        except Exception as e:
            if "ZeroGPU" in str(e) or "GPU task aborted" in str(e) or "CUDA" in str(e):
                print(f"GPU error during model loading: {e}. Retrying with the default model...")
                # On a GPU/ZeroGPU failure, fall back to the default model once
                try:
                    generator = SyntheticDataGenerator(DEFAULT_MODEL)
                except Exception:
                    return None, "Error: Failed to load any model even after fallback. Please try again later.", "", "Model loading failed"
            else:
                # Re-raise other errors
                raise
        if generator.model is None or generator.tokenizer is None:
            return None, "Error: Failed to load the model. Please try again with a different model.", "", "Model loading failed"
# Check RAM usage after model loading
ram_after_model = get_process_memory_usage()
print(f"RAM usage after model loading: {ram_after_model:.2f}GB")
# Save PDF temporarily if it's a file object
if hasattr(pdf_file, 'name'):
# It's already a file path
pdf_path = pdf_file.name
else:
# Create a temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
tmp.write(pdf_file)
pdf_path = tmp.name
# Extract text from PDF
pdf_text = extractor.extract_text_from_pdf(pdf_path)
if not pdf_text:
return None, "Failed to extract text from PDF", "", "No data generated"
# Clean and chunk the text - reduce chunk size to use less memory
cleaned_text = extractor.clean_text(pdf_text)
chunks = extractor.chunk_text(cleaned_text, max_chunk_size=400, overlap=30)
# Check RAM after PDF processing
ram_after_pdf = get_process_memory_usage()
print(f"RAM usage after PDF processing: {ram_after_pdf:.2f}GB, found {len(chunks)} chunks")
# If we're approaching the RAM limit already, reduce batch size
batch_size = 3 # Default
if ram_after_pdf > MAX_RAM_GB * 0.7: # If already using 70% of our limit
batch_size = 1 # Process one chunk at a time
print(f"High RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 1")
elif ram_after_pdf > MAX_RAM_GB * 0.5: # If using 50% of our limit
batch_size = 2 # Process two chunks at a time
print(f"Moderate RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 2")
# Generate Q&A pairs for each chunk
all_qa_pairs = []
all_raw_outputs = []
total_chunks = len(chunks)
# Process chunks in smaller batches to avoid memory buildup
for i in range(0, total_chunks, batch_size):
# Get the current batch of chunks
batch_chunks = chunks[i:min(i+batch_size, total_chunks)]
# Process each chunk in the batch
for j, chunk in enumerate(batch_chunks):
chunk_index = i + j
if progress is not None:
progress(chunk_index / total_chunks, f"Processing chunk {chunk_index+1}/{total_chunks}")
# Check if we're approaching RAM limit
current_ram = get_process_memory_usage()
if current_ram > MAX_RAM_GB * 0.9: # Over 90% of our limit
print(f"WARNING: High RAM usage detected: {current_ram:.2f}GB - force releasing memory")
import gc
gc.collect() # Force garbage collection
if torch.cuda.is_available():
torch.cuda.empty_cache()
# If still too high after garbage collection, abort batch processing
current_ram = get_process_memory_usage()
if current_ram > MAX_RAM_GB * 0.95: # Still dangerously high
print(f"CRITICAL: RAM usage too high ({current_ram:.2f}GB), stopping processing")
break
# Clear CUDA cache between chunks
if torch.cuda.is_available():
torch.cuda.empty_cache()
try:
qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(
chunk,
num_questions=num_questions_per_chunk,
include_tags=include_tags,
difficulty_levels=include_difficulty
)
except Exception as e:
error_type = str(e)
if "CUDA" in error_type or "GPU" in error_type or "ZeroGPU" in error_type:
print(f"GPU error during generation for chunk {chunk_index+1}: {e}")
# Fall back to CPU for this specific generation
raw_output = f"Error in chunk {chunk_index+1}: {str(e)}. Skipping..."
qa_pairs = None
elif "memory" in error_type.lower() or "ram" in error_type.lower():
print(f"Memory error processing chunk {chunk_index+1}: {e}")
# Force garbage collection and skip chunk
import gc
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
raw_output = f"Memory error in chunk {chunk_index+1}: {str(e)}. Skipping..."
qa_pairs = None
else:
# For other errors, just log and continue
print(f"Error processing chunk {chunk_index+1}: {e}")
raw_output = f"Error in chunk {chunk_index+1}: {str(e)}"
qa_pairs = None
if qa_pairs:
all_qa_pairs.extend(qa_pairs)
all_raw_outputs.append(raw_output)
# Check RAM usage after processing this chunk
current_ram = get_process_memory_usage()
print(f"RAM after chunk {chunk_index+1}: {current_ram:.2f}GB")
# Do a thorough cleanup after each batch
if torch.cuda.is_available():
torch.cuda.empty_cache()
# Force garbage collection between batches
import gc
gc.collect()
# Check if we need to abort due to memory constraints
current_ram = get_process_memory_usage()
if current_ram > MAX_RAM_GB:
print(f"WARNING: Exceeding RAM limit ({current_ram:.2f}GB). Stopping further processing.")
if progress is not None:
progress(1.0, f"Stopped early due to high memory usage ({current_ram:.2f}GB)")
break
if progress is not None:
progress(1.0, "Finished processing")
# Final cache clear and garbage collection
if torch.cuda.is_available():
torch.cuda.empty_cache()
import gc
gc.collect()
if not all_qa_pairs:
return None, "Failed to generate Q&A pairs", "\n\n".join(all_raw_outputs), "No data generated"
# Save data to file
filename = save_data(
all_qa_pairs,
output_file_format,
"qa_dataset"
)
# Format for display
formatted_data = format_data_preview(all_qa_pairs)
# Final memory report
final_ram = get_process_memory_usage()
print(f"Final RAM usage: {final_ram:.2f}GB")
return all_qa_pairs, formatted_data, "\n\n".join(all_raw_outputs), f"Data saved to {filename}"
except Exception as e:
error_msg = f"Error processing PDF: {str(e)}"
print(error_msg)
import traceback
print(traceback.format_exc())
return None, error_msg, "", "Processing failed"
# Set up the Gradio interface
def create_interface():
with gr.Blocks(title="PDF Q&A Dataset Generator") as app:
gr.Markdown("# 📚 PDF Q&A Dataset Generator")
gr.Markdown("""
Generate question & answer datasets from PDF documents using instruction-tuned language models.
Perfect for creating educational resources, quiz materials, or training data for Q&A systems.
""")
with gr.Tabs() as tabs:
with gr.TabItem("Generate Q&A Dataset"):
with gr.Row():
with gr.Column(scale=1):
pdf_file = gr.File(
label="Upload PDF",
file_types=[".pdf"],
type="binary"
)
model_dropdown = gr.Dropdown(
choices=load_models(),
value=DEFAULT_MODEL,
label="Model"
)
num_questions = gr.Slider(
minimum=1,
maximum=5,
value=3,
step=1,
label="Questions per Section"
)
include_tags = gr.Checkbox(
value=True,
label="Include Tags"
)
include_difficulty = gr.Checkbox(
value=True,
label="Include Difficulty Levels"
)
output_file_format = gr.Radio(
choices=["json", "csv", "excel"],
value="json",
label="Save File Format"
)
generate_btn = gr.Button("Generate Q&A Dataset", variant="primary")
with gr.Column(scale=2):
with gr.Tab("Parsed Data"):
parsed_data_output = gr.JSON(label="Generated Q&A Pairs")
formatted_data_output = gr.Textbox(
label="Formatted Preview",
lines=15
)
with gr.Tab("Raw Output"):
raw_output = gr.Textbox(
label="Raw Model Output",
lines=15
)
file_output = gr.Textbox(label="File Output")
with gr.TabItem("Documentation"):
gr.Markdown("""
## How to Use
1. **Upload a PDF**: Select a PDF document containing the content you want to generate questions from.
2. **Select a model**: Choose an instruction-tuned language model from the dropdown.
3. **Configure settings**:
- Set the number of questions to generate per text section
- Choose whether to include tags and difficulty levels
- Select your preferred output file format
4. **Generate dataset**: Click the "Generate Q&A Dataset" button to create your dataset.
## About This App
This app uses instruction-tuned language models to generate question and answer pairs from PDF documents. It:
1. Extracts text from the uploaded PDF
2. Splits the text into manageable chunks
3. Generates questions, answers, tags, and difficulty levels for each chunk
4. Combines all Q&A pairs into a comprehensive dataset
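### Example Output
Each generated Q&A pair follows the structure below (the values shown are illustrative):
```json
{
    "question": "What is the main topic of the document?",
    "answer": "The document describes ...",
    "tags": ["topic", "overview"],
    "difficulty": "easy"
}
```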
### Features:
- Automatic text extraction from PDFs
- Smart text chunking to maintain context
- Customizable number of questions per chunk
- Optional tagging and difficulty classification
- Multiple output formats (JSON, CSV, Excel)
### Use Cases:
- Create educational resources and quiz materials
- Generate training data for Q&A systems
- Build flashcard datasets for studying
- Develop content for educational applications
""")
with gr.TabItem("Status"):
gr.Markdown("""
## System Status
This app requests a GPU (ZeroGPU) on Hugging Face Spaces when available and falls back to CPU otherwise; on CPU, model loading and generation can be slow.
If you encounter issues with a specific model, try the default `tiiuae/falcon-7b-instruct` model.
### Troubleshooting
- If the app seems unresponsive after clicking "Generate", please be patient - model loading may take time.
- If you get an error about model loading, try refreshing the page and selecting a different model.
- Not all PDFs can be properly processed - if text extraction fails, try with a different PDF.
""")
# Event handler for generate button
generate_btn.click(
process_pdf_generate_qa,
inputs=[
pdf_file,
model_dropdown,
num_questions,
include_tags,
include_difficulty,
output_file_format
],
outputs=[parsed_data_output, formatted_data_output, raw_output, file_output],
show_progress=True
)
return app
# Export the app for Hugging Face Spaces
app = create_interface()
# Launch the app depending on the environment
if __name__ == "__main__":
app.launch()