import os
import json
import pandas as pd
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import csv
import yaml
from typing import List, Dict, Any
import random
from pypdf import PdfReader
import re
import tempfile
from huggingface_hub import HfApi
# Configuration
DEFAULT_MODEL = "tiiuae/falcon-7b-instruct" # Use Falcon-7B as the default model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Try to use CUDA if available
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.7
HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token from the environment (None if unset)
MAX_RAM_GB = 45 # Set maximum RAM usage to 45GB (below the 70GB limit)
# Create offload folder for model memory management
os.makedirs("offload_folder", exist_ok=True)
# Setup RAM monitoring
def get_process_memory_usage():
"""Get the current memory usage of this process in GB"""
import psutil
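    # psutil is imported lazily here; it must be available in the environment (e.g. via requirements.txt)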
process = psutil.Process(os.getpid())
return process.memory_info().rss / (1024 * 1024 * 1024) # Convert to GB
class PdfExtractor:
"""Extract text content from PDF files"""
@staticmethod
def extract_text_from_pdf(pdf_file):
"""Extract text from a PDF file"""
try:
reader = PdfReader(pdf_file)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text
except Exception as e:
print(f"Error extracting text from PDF: {e}")
return None
@staticmethod
def clean_text(text):
"""Clean and preprocess extracted text"""
if not text:
return ""
# Replace multiple newlines with single newline
text = re.sub(r'\n+', '\n', text)
# Replace multiple spaces with single space
text = re.sub(r'\s+', ' ', text)
return text.strip()
@staticmethod
def chunk_text(text, max_chunk_size=1000, overlap=100):
"""Split text into chunks of specified size with overlap"""
if not text:
return []
chunks = []
start = 0
text_length = len(text)
while start < text_length:
end = min(start + max_chunk_size, text_length)
# If we're not at the end, try to break at a sentence or paragraph
if end < text_length:
# Look for sentence breaks (period, question mark, exclamation mark followed by space)
sentence_break = max(
text.rfind('. ', start, end),
text.rfind('? ', start, end),
text.rfind('! ', start, end),
text.rfind('\n', start, end)
)
if sentence_break > start + max_chunk_size // 2:
end = sentence_break + 1
chunks.append(text[start:end].strip())
start = end - overlap # Create overlap with previous chunk
return chunks
class SyntheticDataGenerator:
def __init__(self, model_name=DEFAULT_MODEL):
self.model_name = model_name
self.model = None
self.tokenizer = None
self.load_model() # Load the model directly during initialization
def load_model(self):
"""Load the specified model."""
# Clear CUDA cache if using GPU to prevent memory fragmentation
if torch.cuda.is_available():
torch.cuda.empty_cache()
try:
print(f"Loading model {self.model_name} on {DEVICE}...")
# Add token for authentication if available
tokenizer_kwargs = {}
model_kwargs = {
"torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
"device_map": "auto" if torch.cuda.is_available() else None,
"low_cpu_mem_usage": True, # Added to reduce memory usage on CPU
"offload_folder": "offload_folder" # Add offload folder for large models
}
if HF_TOKEN:
tokenizer_kwargs["token"] = HF_TOKEN
model_kwargs["token"] = HF_TOKEN
print("Using Hugging Face token for authentication")
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, **tokenizer_kwargs)
# Load the model
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
**model_kwargs
)
# Ensure model is on the right device if not using device_map="auto"
if not torch.cuda.is_available():
self.model = self.model.to(DEVICE)
print(f"Model {self.model_name} loaded successfully on {DEVICE}")
except Exception as e:
print(f"Error loading model {self.model_name}: {e}")
self.model = None
self.tokenizer = None
raise
def generate_qa_prompt(self, context, num_questions=3, include_tags=True, difficulty_levels=True):
"""Generate a prompt for creating Q&A pairs from context."""
tag_instruction = ""
if include_tags:
tag_instruction = "Add 1-3 tags for each question that categorize the topic or subject matter."
difficulty_instruction = ""
if difficulty_levels:
difficulty_instruction = "For each question, assign a difficulty level (easy, medium, or hard)."
prompt = f"""Task: Based on the following text, generate {num_questions} question and answer pairs that would be useful for comprehension testing or knowledge assessment.
CONTEXT:
{context}
For each question:
1. Write a clear, specific question about the information in the text
2. Provide the correct answer to the question, citing relevant details from the text
3. {tag_instruction}
4. {difficulty_instruction}
Format each Q&A pair as a JSON object with the following structure:
{{
"question": "The question text",
"answer": "The answer text",
"tags": ["tag1", "tag2"],
"difficulty": "easy/medium/hard"
}}
Return all Q&A pairs in a JSON array.
"""
return prompt
def generate_data(self, prompt, num_samples=1):
"""Generate synthetic data using the loaded model."""
if not self.model or not self.tokenizer:
return ["Error: Model not loaded properly. Please try again with a different model."]
outputs = []
for sample_idx in range(num_samples):
try:
# Clear CUDA cache before generating to free up memory
if torch.cuda.is_available():
torch.cuda.empty_cache()
# ZeroGPU errors often occur in generate() calls
# To mitigate this, try multiple approaches in sequence
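                # Approaches: (1) normal generation on the current device; (2) if a GPU/ZeroGPU
                # error is raised, fall back to CPU; (3) if already on CPU, retry with reduced settings.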
inputs = self.tokenizer(prompt, return_tensors="pt").to(DEVICE)
try:
# First try: Standard generation with conservative settings
with torch.no_grad():
output = self.model.generate(
**inputs,
max_new_tokens=MAX_NEW_TOKENS,
temperature=TEMPERATURE,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
num_beams=1, # Use greedy decoding instead of beam search
early_stopping=True,
no_repeat_ngram_size=3 # Prevent repetition
)
decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
                except Exception as e:
if "CUDA" in str(e) or "GPU" in str(e) or "ZeroGPU" in str(e):
print(f"GPU error during generation: {e}")
print("Falling back to CPU generation...")
# Move everything to CPU
inputs = {k: v.to('cpu') for k, v in inputs.items()}
# Create CPU copy of the model if we were using GPU
if torch.cuda.is_available():
# Temporarily move model to CPU for this generation
model_cpu = self.model.to('cpu')
with torch.no_grad():
output = model_cpu.generate(
**inputs,
max_new_tokens=MAX_NEW_TOKENS,
temperature=TEMPERATURE,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
num_return_sequences=1,
max_length=MAX_NEW_TOKENS + inputs['input_ids'].shape[1]
)
decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
# Move model back to CUDA for future calls
self.model = self.model.to(DEVICE)
else:
# Already on CPU, try with reduced parameters
with torch.no_grad():
output = self.model.generate(
**inputs,
max_new_tokens=min(256, MAX_NEW_TOKENS), # Reduce token count
temperature=0.5, # Lower temperature
do_sample=False, # No sampling
num_return_sequences=1,
pad_token_id=self.tokenizer.eos_token_id
)
decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
else:
# Re-raise non-CUDA errors
raise
# Extract only the generated part (remove prompt)
                # Use item access: `inputs` may be a BatchEncoding or a plain dict (CPU fallback path)
                prompt_text = self.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
generated_text = decoded_output[len(prompt_text):].strip()
outputs.append(generated_text)
# Clear CUDA cache between samples
if torch.cuda.is_available():
torch.cuda.empty_cache()
except Exception as e:
error_msg = f"Error generating sample {sample_idx+1}: {str(e)}"
print(error_msg)
outputs.append(f"Error: {error_msg}")
return outputs
def parse_json_data(self, generated_text):
"""Extract and parse JSON from generated text."""
try:
# Find JSON-like content (between [ and ])
start_idx = generated_text.find('[')
end_idx = generated_text.rfind(']') + 1
if start_idx >= 0 and end_idx > start_idx:
json_str = generated_text[start_idx:end_idx]
return json.loads(json_str)
# Try to find single object format
start_idx = generated_text.find('{')
end_idx = generated_text.rfind('}') + 1
if start_idx >= 0 and end_idx > start_idx:
json_str = generated_text[start_idx:end_idx]
return json.loads(json_str)
print(f"Could not find JSON content in: {generated_text}")
return None
except json.JSONDecodeError as e:
print(f"JSON parse error: {e}")
print(f"Problematic text: {generated_text}")
# Try to find and fix common JSON formatting errors
try:
# Replace single quotes with double quotes
json_str = generated_text[start_idx:end_idx].replace("'", "\"")
return json.loads(json_str)
except:
pass
# If still failing, try to extract individual JSON objects
try:
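                # This pattern only matches flat JSON objects (no nested braces); the expected
                # Q&A records contain arrays but no nested objects, so that is sufficient here.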
pattern = r'\{[^{}]*\}'
matches = re.findall(pattern, generated_text)
if matches:
results = []
for match in matches:
try:
# Replace single quotes with double quotes
fixed_match = match.replace("'", "\"")
obj = json.loads(fixed_match)
results.append(obj)
except:
continue
if results:
return results
except:
pass
return None
def generate_qa_from_pdf_chunk(self, chunk, num_questions=3, include_tags=True, difficulty_levels=True):
"""Generate Q&A pairs from a PDF text chunk."""
if not self.model or not self.tokenizer:
return [], "Error: Model not loaded properly. Please try again with a different model."
if not chunk or len(chunk.strip()) < 100: # Skip very small chunks
return [], "Chunk too small to generate meaningful Q&A pairs."
prompt = self.generate_qa_prompt(chunk, num_questions, include_tags, difficulty_levels)
raw_outputs = self.generate_data(prompt, num_samples=1)
raw_output = raw_outputs[0]
parsed_data = self.parse_json_data(raw_output)
# Ensure parsed data is a list
if parsed_data and isinstance(parsed_data, dict):
parsed_data = [parsed_data]
# Return both the parsed data and raw output for debugging
return parsed_data, raw_output
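# Illustrative usage of the generator outside the Gradio UI (not executed here):
#   generator = SyntheticDataGenerator()  # loads DEFAULT_MODEL
#   qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(text_chunk, num_questions=2)
#   where text_chunk is a few hundred characters of extracted PDF text.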
def format_data_preview(data):
"""Format the data for preview in the UI."""
if isinstance(data, list):
if len(data) > 0 and isinstance(data[0], dict):
# Convert list of dicts to DataFrame for better display
return pd.DataFrame(data).to_string()
else:
return json.dumps(data, indent=2)
elif isinstance(data, dict):
return json.dumps(data, indent=2)
else:
return str(data)
def save_data(data, format, filename_prefix):
"""Save data to a file in the specified format."""
os.makedirs("synthetic_data", exist_ok=True)
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
filename = f"synthetic_data/{filename_prefix}_{timestamp}"
if isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
df = pd.DataFrame(data)
if format.lower() == "csv":
full_filename = f"{filename}.csv"
df.to_csv(full_filename, index=False)
elif format.lower() == "json":
full_filename = f"{filename}.json"
with open(full_filename, "w") as f:
json.dump(data, f, indent=2)
elif format.lower() == "excel":
full_filename = f"{filename}.xlsx"
df.to_excel(full_filename, index=False)
else:
full_filename = f"{filename}.txt"
with open(full_filename, "w") as f:
f.write(str(data))
else:
full_filename = f"{filename}.{format.lower()}"
with open(full_filename, "w") as f:
if format.lower() == "json":
json.dump(data, f, indent=2)
else:
f.write(str(data))
return full_filename
def load_models():
"""Return a list of available models."""
return [
"tiiuae/falcon-7b-instruct"
]
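# The @spaces.GPU decorator requests GPU access for this function when the Space runs on ZeroGPU hardware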
@spaces.GPU
def process_pdf_generate_qa(pdf_file, model_name, num_questions_per_chunk, include_tags, include_difficulty, output_file_format, progress=None):
"""Process a PDF file and generate Q&A pairs from its content."""
if pdf_file is None:
return None, "Error: No PDF file uploaded", "", "No file provided"
try:
# Check RAM usage at start
current_ram_usage = get_process_memory_usage()
print(f"Starting RAM usage: {current_ram_usage:.2f}GB")
# Clear CUDA cache before starting
if torch.cuda.is_available():
torch.cuda.empty_cache()
        # Initialize the extractor and the generator. The generator loads the model in its
        # constructor, so wrap construction in try/except to handle loading errors.
        extractor = PdfExtractor()
        try:
            generator = SyntheticDataGenerator(model_name)
        except Exception as e:
            if "ZeroGPU" in str(e) or "GPU task aborted" in str(e) or "CUDA" in str(e):
                print(f"GPU error during model loading: {e}. Retrying with the default model...")
                # On GPU/ZeroGPU errors, fall back to the default model
                try:
                    generator = SyntheticDataGenerator(DEFAULT_MODEL)
                except Exception:
                    return None, "Error: Failed to load any model even after fallback. Please try again later.", "", "Model loading failed"
            else:
                # Re-raise other errors
                raise
# Check RAM usage after model loading
ram_after_model = get_process_memory_usage()
print(f"RAM usage after model loading: {ram_after_model:.2f}GB")
# Save PDF temporarily if it's a file object
if hasattr(pdf_file, 'name'):
# It's already a file path
pdf_path = pdf_file.name
else:
# Create a temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
tmp.write(pdf_file)
pdf_path = tmp.name
# Extract text from PDF
pdf_text = extractor.extract_text_from_pdf(pdf_path)
if not pdf_text:
return None, "Failed to extract text from PDF", "", "No data generated"
# Clean and chunk the text - reduce chunk size to use less memory
cleaned_text = extractor.clean_text(pdf_text)
chunks = extractor.chunk_text(cleaned_text, max_chunk_size=400, overlap=30)
# Check RAM after PDF processing
ram_after_pdf = get_process_memory_usage()
print(f"RAM usage after PDF processing: {ram_after_pdf:.2f}GB, found {len(chunks)} chunks")
# If we're approaching the RAM limit already, reduce batch size
batch_size = 3 # Default
if ram_after_pdf > MAX_RAM_GB * 0.7: # If already using 70% of our limit
batch_size = 1 # Process one chunk at a time
print(f"High RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 1")
elif ram_after_pdf > MAX_RAM_GB * 0.5: # If using 50% of our limit
batch_size = 2 # Process two chunks at a time
print(f"Moderate RAM usage detected ({ram_after_pdf:.2f}GB), reducing batch size to 2")
# Generate Q&A pairs for each chunk
all_qa_pairs = []
all_raw_outputs = []
total_chunks = len(chunks)
# Process chunks in smaller batches to avoid memory buildup
for i in range(0, total_chunks, batch_size):
# Get the current batch of chunks
batch_chunks = chunks[i:min(i+batch_size, total_chunks)]
# Process each chunk in the batch
for j, chunk in enumerate(batch_chunks):
chunk_index = i + j
if progress is not None:
progress(chunk_index / total_chunks, f"Processing chunk {chunk_index+1}/{total_chunks}")
# Check if we're approaching RAM limit
current_ram = get_process_memory_usage()
if current_ram > MAX_RAM_GB * 0.9: # Over 90% of our limit
print(f"WARNING: High RAM usage detected: {current_ram:.2f}GB - force releasing memory")
import gc
gc.collect() # Force garbage collection
if torch.cuda.is_available():
torch.cuda.empty_cache()
# If still too high after garbage collection, abort batch processing
current_ram = get_process_memory_usage()
if current_ram > MAX_RAM_GB * 0.95: # Still dangerously high
print(f"CRITICAL: RAM usage too high ({current_ram:.2f}GB), stopping processing")
break
# Clear CUDA cache between chunks
if torch.cuda.is_available():
torch.cuda.empty_cache()
try:
qa_pairs, raw_output = generator.generate_qa_from_pdf_chunk(
chunk,
num_questions=num_questions_per_chunk,
include_tags=include_tags,
difficulty_levels=include_difficulty
)
except Exception as e:
error_type = str(e)
if "CUDA" in error_type or "GPU" in error_type or "ZeroGPU" in error_type:
print(f"GPU error during generation for chunk {chunk_index+1}: {e}")
# Fall back to CPU for this specific generation
raw_output = f"Error in chunk {chunk_index+1}: {str(e)}. Skipping..."
qa_pairs = None
elif "memory" in error_type.lower() or "ram" in error_type.lower():
print(f"Memory error processing chunk {chunk_index+1}: {e}")
# Force garbage collection and skip chunk
import gc
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
raw_output = f"Memory error in chunk {chunk_index+1}: {str(e)}. Skipping..."
qa_pairs = None
else:
# For other errors, just log and continue
print(f"Error processing chunk {chunk_index+1}: {e}")
raw_output = f"Error in chunk {chunk_index+1}: {str(e)}"
qa_pairs = None
if qa_pairs:
all_qa_pairs.extend(qa_pairs)
all_raw_outputs.append(raw_output)
# Check RAM usage after processing this chunk
current_ram = get_process_memory_usage()
print(f"RAM after chunk {chunk_index+1}: {current_ram:.2f}GB")
# Do a thorough cleanup after each batch
if torch.cuda.is_available():
torch.cuda.empty_cache()
# Force garbage collection between batches
import gc
gc.collect()
# Check if we need to abort due to memory constraints
current_ram = get_process_memory_usage()
if current_ram > MAX_RAM_GB:
print(f"WARNING: Exceeding RAM limit ({current_ram:.2f}GB). Stopping further processing.")
if progress is not None:
progress(1.0, f"Stopped early due to high memory usage ({current_ram:.2f}GB)")
break
if progress is not None:
progress(1.0, "Finished processing")
# Final cache clear and garbage collection
if torch.cuda.is_available():
torch.cuda.empty_cache()
import gc
gc.collect()
if not all_qa_pairs:
return None, "Failed to generate Q&A pairs", "\n\n".join(all_raw_outputs), "No data generated"
# Save data to file
filename = save_data(
all_qa_pairs,
output_file_format,
"qa_dataset"
)
# Format for display
formatted_data = format_data_preview(all_qa_pairs)
# Final memory report
final_ram = get_process_memory_usage()
print(f"Final RAM usage: {final_ram:.2f}GB")
return all_qa_pairs, formatted_data, "\n\n".join(all_raw_outputs), f"Data saved to {filename}"
except Exception as e:
error_msg = f"Error processing PDF: {str(e)}"
print(error_msg)
import traceback
print(traceback.format_exc())
return None, error_msg, "", "Processing failed"
# Set up the Gradio interface
def create_interface():
with gr.Blocks(title="PDF Q&A Dataset Generator") as app:
gr.Markdown("# π PDF Q&A Dataset Generator")
gr.Markdown("""
Generate question & answer datasets from PDF documents using instruction-tuned language models.
Perfect for creating educational resources, quiz materials, or training data for Q&A systems.
""")
with gr.Tabs() as tabs:
with gr.TabItem("Generate Q&A Dataset"):
with gr.Row():
with gr.Column(scale=1):
pdf_file = gr.File(
label="Upload PDF",
file_types=[".pdf"],
type="binary"
)
model_dropdown = gr.Dropdown(
choices=load_models(),
value=DEFAULT_MODEL,
label="Model"
)
num_questions = gr.Slider(
minimum=1,
maximum=5,
value=3,
step=1,
label="Questions per Section"
)
include_tags = gr.Checkbox(
value=True,
label="Include Tags"
)
include_difficulty = gr.Checkbox(
value=True,
label="Include Difficulty Levels"
)
output_file_format = gr.Radio(
choices=["json", "csv", "excel"],
value="json",
label="Save File Format"
)
generate_btn = gr.Button("Generate Q&A Dataset", variant="primary")
progress_bar = gr.Progress()
with gr.Column(scale=2):
with gr.Tab("Parsed Data"):
parsed_data_output = gr.JSON(label="Generated Q&A Pairs")
formatted_data_output = gr.Textbox(
label="Formatted Preview",
lines=15
)
with gr.Tab("Raw Output"):
raw_output = gr.Textbox(
label="Raw Model Output",
lines=15
)
file_output = gr.Textbox(label="File Output")
with gr.TabItem("Documentation"):
gr.Markdown("""
## How to Use
1. **Upload a PDF**: Select a PDF document containing the content you want to generate questions from.
2. **Select a model**: Choose an instruction-tuned language model from the dropdown.
3. **Configure settings**:
- Set the number of questions to generate per text section
- Choose whether to include tags and difficulty levels
- Select your preferred output file format
4. **Generate dataset**: Click the "Generate Q&A Dataset" button to create your dataset.
## About This App
This app uses instruction-tuned language models to generate question and answer pairs from PDF documents. It:
1. Extracts text from the uploaded PDF
2. Splits the text into manageable chunks
3. Generates questions, answers, tags, and difficulty levels for each chunk
4. Combines all Q&A pairs into a comprehensive dataset
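Each generated record follows the structure requested in the prompt (the field values below are only illustrative):
```json
{
  "question": "What does the text say about X?",
  "answer": "According to the text, ...",
  "tags": ["topic"],
  "difficulty": "medium"
}
```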
### Features:
- Automatic text extraction from PDFs
- Smart text chunking to maintain context
- Customizable number of questions per chunk
- Optional tagging and difficulty classification
- Multiple output formats (JSON, CSV, Excel)
### Use Cases:
- Create educational resources and quiz materials
- Generate training data for Q&A systems
- Build flashcard datasets for studying
- Develop content for educational applications
""")
with gr.TabItem("Status"):
gr.Markdown("""
## System Status
This app runs on GPU when available and falls back to CPU, where model loading and generation are noticeably slower.
If you encounter issues with a specific model, try the default model `tiiuae/falcon-7b-instruct`.
### Troubleshooting
- If the app seems unresponsive after clicking "Generate", please be patient - model loading may take time.
- If you get an error about model loading, try refreshing the page and selecting a different model.
- Not all PDFs can be properly processed - if text extraction fails, try with a different PDF.
""")
# Event handler for generate button
generate_btn.click(
process_pdf_generate_qa,
inputs=[
pdf_file,
model_dropdown,
num_questions,
include_tags,
include_difficulty,
output_file_format
],
outputs=[parsed_data_output, formatted_data_output, raw_output, file_output],
show_progress=True
)
return app
# Export the app for Hugging Face Spaces
app = create_interface()
# Launch the app depending on the environment
if __name__ == "__main__":
app.launch()