import gradio as gr
import os
import time
import requests
from datetime import datetime
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from langchain_core.caches import BaseCache
from langchain_core.callbacks import Callbacks
import pandas as pd
import io
import tempfile
from urllib.parse import urlparse
import re
# Import Docling and the configuration classes needed for OCR
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
# Rebuild the ChatGoogleGenerativeAI pydantic model now that BaseCache and Callbacks are imported
try:
    ChatGoogleGenerativeAI.model_rebuild()
except Exception as e:
    print(f"Warning during ChatGoogleGenerativeAI.model_rebuild(): {e}")
# --- START OF OCR CONFIGURATION ---
# Create a single, pre-configured DocumentConverter instance to be reused.
# This is more efficient than creating it on every function call.
# 1. Define the pipeline options to enable OCR for PDFs.
#    Tesseract is selected by passing TesseractCliOcrOptions; with tesseract-ocr-all
#    installed, the packaged language data files are available. Keep the lang list
#    small for speed or expand it as needed; any code without an installed
#    .traineddata file will make the Tesseract call fail.
pdf_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=TesseractCliOcrOptions(
        lang=[
            "eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces", "rus", "ukr", "ell", "ron", "hun",
            "bul", "hrv", "srp", "slk", "slv", "lit", "lav", "est", "cat", "eus", "glg", "isl", "dan", "nor", "swe",
            "fin", "alb", "mlt", "afr", "zul", "swa", "amh", "uzb", "aze", "kaz", "kir", "mon", "tgl", "ind", "msa",
            "tha", "vie", "khm", "lao", "mya", "ben", "hin", "mar", "guj", "pan", "mal", "tam", "tel", "kan", "nep",
            "sin", "urd", "fas", "pus", "kur", "aze_cyrl", "tat", "uig", "heb", "ara", "yid", "grc", "chr", "epo",
            "hye", "kat", "kat_old", "mkd", "bel", "srp_latn",
            # CJK: these are heavier and slower; include only if needed:
            "chi_sim", "chi_tra", "jpn", "kor",
        ]
    ),
)
# 2. Create the format-specific configuration.
format_options = {
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
}
# 3. Initialize the converter with the OCR configuration.
# This converter will now automatically perform OCR on any PDF file.
docling_converter = DocumentConverter(format_options=format_options)
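# For reference, a converted document can be exported to Markdown like this
# (the path "example.pdf" is purely illustrative):
#   result = docling_converter.convert("example.pdf")
#   markdown = result.document.export_to_markdown()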
# --- END OF OCR CONFIGURATION ---
# Model configuration
MODELS = {
"Gemini 2.5 Flash (Google AI)": {
"provider": "Google AI",
"class": ChatGoogleGenerativeAI,
"model_name": "gemini-2.0-flash-exp",
"default_api": True
},
"ChatGPT 5 (OpenAI)": {
"provider": "OpenAI",
"class": ChatOpenAI,
"model_name": "gpt-4o",
"default_api": False
},
"Claude Sonnet 4 (Anthropic)": {
"provider": "Anthropic",
"class": ChatAnthropic,
"model_name": "claude-3-5-sonnet-20241022",
"default_api": False
},
"Gemini 2.5 Pro (Google AI)": {
"provider": "Google AI",
"class": ChatGoogleGenerativeAI,
"model_name": "gemini-2.0-flash-exp",
"default_api": False
}
}
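# Each MODELS entry maps a dropdown label to its LangChain chat class and the provider's
# model identifier; "default_api" marks the model for which a bundled key (see
# DEFAULT_GEMINI_API below) lets users skip entering their own API key.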
# Default API for Gemini 2.5 Flash via HF Spaces Secrets
DEFAULT_GEMINI_API = os.getenv("FLASH_GOOGLE_API_KEY")
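# FLASH_GOOGLE_API_KEY is expected to be set as a Secret in the Space settings;
# if it is missing, os.getenv returns None and Gemini calls made without a
# user-supplied key will fail at invocation time.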
def extract_text_from_file(file):
"""
Extract text from an uploaded file or path (str).
- Accepts an object with .name attribute (e.g. Gradio upload) OR a file path (str).
- DocLing for: .pdf (Tesseract OCR enabled if configured), .docx, .xlsx, .pptx
- Converts .csv /.xls -> temporary .xlsx then DocLing
- .txt read directly
"""
if file is None:
return ""
# Normalize to a filesystem path string
path = file.name if hasattr(file, "name") else str(file)
ext = os.path.splitext(path)[1].lower()
docling_direct = {".pdf", ".docx", ".xlsx", ".pptx"}
to_xlsx_first = {".csv", ".xls"}
try:
if ext in docling_direct:
result = docling_converter.convert(path)
return result.document.export_to_markdown()
elif ext in to_xlsx_first:
# Convert CSV/XLS -> XLSX
if ext == ".csv":
df = pd.read_csv(path)
else: # .xls
df = pd.read_excel(path)
with tempfile.NamedTemporaryFile(delete=True, suffix=".xlsx") as tmp:
df.to_excel(tmp.name, index=False)
result = docling_converter.convert(tmp.name)
return result.document.export_to_markdown()
elif ext == ".txt":
with open(path, "r", encoding="utf-8") as f:
return f.read()
else:
return "Unsupported file format"
except Exception as e:
return f"Error reading file: {str(e)}"
def extract_text_from_url(url):
"""Extract text from a URL"""
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
content = response.text
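        # Crude cleanup: strip HTML tags with a regex, then collapse whitespace.
        # Note that <script>/<style> contents survive as plain text; an HTML parser
        # such as BeautifulSoup would be more robust if cleaner extraction is needed.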
content = re.sub(r'<[^>]+>', '', content)
content = re.sub(r'\s+', ' ', content).strip()
return content[:10000] # Limit to 10k characters
except Exception as e:
return f"Error retrieving URL: {str(e)}"
def get_document_content(text_input, url_input, file_input):
"""Retrieve document content based on source"""
if text_input.strip():
return text_input.strip()
elif url_input.strip():
return extract_text_from_url(url_input.strip())
elif file_input is not None:
return extract_text_from_file(file_input)
else:
return ""
def create_llm_instance(model_name, api_key):
"""Create an LLM model instance"""
model_config = MODELS[model_name]
if model_config["provider"] == "OpenAI":
return model_config["class"](
model=model_config["model_name"],
api_key=api_key,
temperature=0.7
)
elif model_config["provider"] == "Anthropic":
return model_config["class"](
model=model_config["model_name"],
api_key=api_key,
temperature=0.7
)
elif model_config["provider"] == "Google AI":
api_to_use = api_key if api_key else DEFAULT_GEMINI_API
return model_config["class"](
model=model_config["model_name"],
google_api_key=api_to_use,
temperature=0.7
)
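# Example (illustrative): with the FLASH_GOOGLE_API_KEY secret set, the default model
# needs no user key:
#   llm = create_llm_instance("Gemini 2.5 Flash (Google AI)", "")
#   llm.invoke([HumanMessage(content="Hello")])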
def generate_html(model_name, api_key, text_input, url_input, file_input):
"""Generate educational HTML file"""
start_time = time.time()
    # Models without a bundled default key require a user-supplied API key
    if not MODELS[model_name]["default_api"] and not api_key.strip():
return None, "❌ Error: Please provide an API key for this model.", 0
document_content = get_document_content(text_input, url_input, file_input)
if not document_content:
return None, "❌ Error: Please provide a document (text, URL or file).", 0
try:
# Create LLM instance
llm = create_llm_instance(model_name, api_key)
# Read prompt template
with open("creation_educational_html_from_any_document_18082025.txt", "r", encoding="utf-8") as f:
prompt_template = f.read()
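        # The template file is expected to sit next to app.py and must contain the
        # placeholders {model_name}, {provider_name} and {document}, filled in below via str.format.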
# Replace variables
model_config = MODELS[model_name]
prompt = prompt_template.format(
model_name=model_config["model_name"],
provider_name=model_config["provider"],
document=document_content
)
# Generate content
message = HumanMessage(content=prompt)
response = llm.invoke([message])
html_content = response.content
# Clean any code tags from models
html_content = html_content.replace("```html", "")
html_content = html_content.replace("```", "")
# Calculate generation time
generation_time = time.time() - start_time
# Save HTML file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"educational_document_{timestamp}.html"
with open(filename, "w", encoding="utf-8") as f:
f.write(html_content)
success_message = f"✅ HTML file generated successfully in {generation_time:.2f} seconds!"
return filename, success_message, generation_time
except Exception as e:
error_message = f"❌ Error during generation: {str(e)}"
return None, error_message, 0
def reset_form():
"""Reset the form to zero"""
return (
"Gemini 2.5 Flash (Google AI)", # model_name
"", # api_key
"", # text_input
"", # url_input
None, # file_input
"", # status_message
None, # html_file
"" # html_preview
)
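# Note: the tuple order above must match the outputs list of reset_btn.click below
# (model_dropdown, api_input, text_input, url_input, file_input, status_output,
#  html_file_output, html_preview).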
def update_api_info(model_name):
"""Update API information based on selected model"""
if model_name == "Gemini 2.5 Flash (Google AI)":
return gr.update(
label="API Key (optional)",
placeholder="Free API available until exhausted, or use your own key",
info="💡 A free API is already configured for this model. You can use your own key if you wish."
)
else:
return gr.update(
label="API Key (required)",
placeholder="Enter your API key",
info="🔑 API key required for this model"
)
# Gradio Interface (Apple-like)
with gr.Blocks(
title="EduHTML Creator - Educational HTML Content Generator",
theme=gr.themes.Soft(),
css="style.css",
js="script.js"
) as app:
# Header hero (black, full-width look within container)
gr.HTML("""
<div class="header" role="banner">
<div class="header-inner">
<h1>🎓 EduHTML Creator</h1>
<p>
Transform any document into interactive educational HTML content, with a premium Apple-inspired design.
Document fidelity, clear structure, interactivity, and highlighting of key information.
</p>
</div>
</div>
""")
with gr.Column(elem_classes=["main-container"]):
# Model Configuration Section
gr.HTML("<div class='section'>")
model_dropdown = gr.Dropdown(
choices=list(MODELS.keys()),
value="Gemini 2.5 Flash (Google AI)",
label="LLM Model",
info="Select the model to use for generation"
)
api_input = gr.Textbox(
label="API Key (optional)",
placeholder="Free API (Gemini Flash) available. You can enter your own key.",
info="For OpenAI/Anthropic, a key is required.",
type="password"
)
gr.HTML("</div>")
# Document Source Section with tabs
gr.HTML("<div class='section alt'>")
gr.HTML("<h3>Document Source</h3>")
with gr.Tabs():
with gr.TabItem("📝 Text"):
text_input = gr.Textbox(
label="Copied/pasted text",
placeholder="Paste your text here...",
lines=4
)
with gr.TabItem("🌐 URL"):
url_input = gr.Textbox(
label="Web Link",
placeholder="https://example.com/article"
)
with gr.TabItem("📁 File"):
file_input = gr.File(
label="File",
file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".pptx"]
)
gr.HTML("</div>")
# Action buttons
with gr.Row():
submit_btn = gr.Button("Generate HTML", variant="primary", elem_classes=["apple-button"])
reset_btn = gr.Button("Reset", elem_classes=["reset-button"])
# Results Section
status_output = gr.HTML(label="Status")
gr.HTML("<div class='section preview-card'>")
gr.HTML("<div class='preview-header'><div class='preview-dot' aria-hidden='true'></div><div>Preview</div></div>")
html_preview = gr.HTML(label="Preview", visible=False, elem_id="html-preview", elem_classes=["preview-body"])
html_file_output = gr.File(label="Downloadable HTML file", visible=False)
gr.HTML("</div>")
# Footer (black)
gr.HTML("""
<div class="footer" role="contentinfo">
<div class="footer-inner">
<span>Apple-inspired design • High contrasts • Smooth interactions</span>
</div>
</div>
""")
# Events
model_dropdown.change(
fn=update_api_info,
inputs=[model_dropdown],
outputs=[api_input]
)
    # The generation time is carried between the two steps via a shared State
    generation_time_state = gr.State()
    submit_btn.click(
        fn=generate_html,
        inputs=[model_dropdown, api_input, text_input, url_input, file_input],
        outputs=[html_file_output, status_output, generation_time_state]
    ).then(
        fn=lambda file, status, _: (
            gr.update(visible=file is not None),
            status,
            gr.update(visible=file is not None, value=(open(file, 'r', encoding='utf-8').read() if file else ""))
        ),
        inputs=[html_file_output, status_output, generation_time_state],
        outputs=[html_file_output, status_output, html_preview]
    )
reset_btn.click(
fn=reset_form,
outputs=[model_dropdown, api_input, text_input, url_input, file_input, status_output, html_file_output, html_preview]
)
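# Note: on Hugging Face Spaces the platform exposes the app on port 7860 directly,
# so share=True is typically unnecessary there; it only matters for local runs.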
if __name__ == "__main__":
app.launch(
server_name="0.0.0.0",
server_port=7860,
share=True
)