import gradio as gr
import os
import time
import requests
from datetime import datetime
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from langchain_core.caches import BaseCache
from langchain_core.callbacks import Callbacks
import pandas as pd
import io
import tempfile
from urllib.parse import urlparse
import re
# Import Docling and the configuration classes needed for OCR
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
# Rebuild the ChatGoogleGenerativeAI pydantic model now that BaseCache and Callbacks are imported
try:
    ChatGoogleGenerativeAI.model_rebuild()
except Exception as e:
    print(f"Warning during ChatGoogleGenerativeAI.model_rebuild(): {e}")
# --- START OF OCR CONFIGURATION ---
# Create a single, pre-configured DocumentConverter instance to be reused.
# This is more efficient than creating it on every function call.
# 1. Define the pipeline options to enable OCR for PDFs.
#    Tesseract is selected by passing TesseractCliOcrOptions; with tesseract-ocr-all
#    installed, the packaged language data files are available. Keep the lang list
#    small for speed or expand it as needed; any code without an installed
#    .traineddata file will make the Tesseract call fail.
pdf_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=TesseractCliOcrOptions(
        lang=[
            "eng", "fra", "deu", "spa", "ita", "por", "nld", "pol", "tur", "ces", "rus", "ukr", "ell", "ron", "hun",
            "bul", "hrv", "srp", "slk", "slv", "lit", "lav", "est", "cat", "eus", "glg", "isl", "dan", "nor", "swe",
            "fin", "alb", "mlt", "afr", "zul", "swa", "amh", "uzb", "aze", "kaz", "kir", "mon", "tgl", "ind", "msa",
            "tha", "vie", "khm", "lao", "mya", "ben", "hin", "mar", "guj", "pan", "mal", "tam", "tel", "kan", "nep",
            "sin", "urd", "fas", "pus", "kur", "aze_cyrl", "tat", "uig", "heb", "ara", "yid", "grc", "chr", "epo",
            "hye", "kat", "kat_old", "mkd", "bel", "srp_latn",
            # CJK: these are heavier and slower; include only if needed:
            "chi_sim", "chi_tra", "jpn", "kor",
        ]
    ),
)
# 2. Create the format-specific configuration.
format_options = {
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
}
# 3. Initialize the converter with the OCR configuration.
# This converter will now automatically perform OCR on any PDF file.
docling_converter = DocumentConverter(format_options=format_options)
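# For reference, a converted document can be exported to Markdown like this
# (the path "example.pdf" is purely illustrative):
#   result = docling_converter.convert("example.pdf")
#   markdown = result.document.export_to_markdown()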
# --- END OF OCR CONFIGURATION ---
# Model configuration
MODELS = {
"Gemini 2.5 Flash (Google AI)": {
"provider": "Google AI",
"class": ChatGoogleGenerativeAI,
"model_name": "gemini-2.0-flash-exp",
"default_api": True
},
"ChatGPT 5 (OpenAI)": {
"provider": "OpenAI",
"class": ChatOpenAI,
"model_name": "gpt-4o",
"default_api": False
},
"Claude Sonnet 4 (Anthropic)": {
"provider": "Anthropic",
"class": ChatAnthropic,
"model_name": "claude-3-5-sonnet-20241022",
"default_api": False
},
"Gemini 2.5 Pro (Google AI)": {
"provider": "Google AI",
"class": ChatGoogleGenerativeAI,
"model_name": "gemini-2.0-flash-exp",
"default_api": False
}
}
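# Each MODELS entry maps a dropdown label to its LangChain chat class and the provider's
# model identifier; "default_api" marks the model for which a bundled key (see
# DEFAULT_GEMINI_API below) lets users skip entering their own API key.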
# Default API for Gemini 2.5 Flash via HF Spaces Secrets
DEFAULT_GEMINI_API = os.getenv("FLASH_GOOGLE_API_KEY")
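# FLASH_GOOGLE_API_KEY is expected to be set as a Secret in the Space settings;
# if it is missing, os.getenv returns None and Gemini calls made without a
# user-supplied key will fail at invocation time.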
def extract_text_from_file(file):
"""
Extract text from an uploaded file or path (str).
- Accepts an object with .name attribute (e.g. Gradio upload) OR a file path (str).
- DocLing for: .pdf (Tesseract OCR enabled if configured), .docx, .xlsx, .pptx
- Converts .csv /.xls -> temporary .xlsx then DocLing
- .txt read directly
"""
if file is None:
return ""
# Normalize to a filesystem path string
path = file.name if hasattr(file, "name") else str(file)
ext = os.path.splitext(path)[1].lower()
docling_direct = {".pdf", ".docx", ".xlsx", ".pptx"}
to_xlsx_first = {".csv", ".xls"}
try:
if ext in docling_direct:
result = docling_converter.convert(path)
return result.document.export_to_markdown()
elif ext in to_xlsx_first:
# Convert CSV/XLS -> XLSX
if ext == ".csv":
df = pd.read_csv(path)
else: # .xls
df = pd.read_excel(path)
with tempfile.NamedTemporaryFile(delete=True, suffix=".xlsx") as tmp:
df.to_excel(tmp.name, index=False)
result = docling_converter.convert(tmp.name)
return result.document.export_to_markdown()
elif ext == ".txt":
with open(path, "r", encoding="utf-8") as f:
return f.read()
else:
return "Unsupported file format"
except Exception as e:
return f"Error reading file: {str(e)}"
def extract_text_from_url(url):
"""Extract text from a URL"""
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
content = response.text
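        # Crude cleanup: strip HTML tags with a regex, then collapse whitespace.
        # Note that <script>/<style> contents survive as plain text; an HTML parser
        # such as BeautifulSoup would be more robust if cleaner extraction is needed.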
content = re.sub(r'<[^>]+>', '', content)
content = re.sub(r'\s+', ' ', content).strip()
return content[:10000] # Limit to 10k characters
except Exception as e:
return f"Error retrieving URL: {str(e)}"
def get_document_content(text_input, url_input, file_input):
"""Retrieve document content based on source"""
if text_input.strip():
return text_input.strip()
elif url_input.strip():
return extract_text_from_url(url_input.strip())
elif file_input is not None:
return extract_text_from_file(file_input)
else:
return ""
def create_llm_instance(model_name, api_key):
"""Create an LLM model instance"""
model_config = MODELS[model_name]
if model_config["provider"] == "OpenAI":
return model_config["class"](
model=model_config["model_name"],
api_key=api_key,
temperature=0.7
)
elif model_config["provider"] == "Anthropic":
return model_config["class"](
model=model_config["model_name"],
api_key=api_key,
temperature=0.7
)
elif model_config["provider"] == "Google AI":
api_to_use = api_key if api_key else DEFAULT_GEMINI_API
return model_config["class"](
model=model_config["model_name"],
google_api_key=api_to_use,
temperature=0.7
)
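# Example (illustrative): with the FLASH_GOOGLE_API_KEY secret set, the default model
# needs no user key:
#   llm = create_llm_instance("Gemini 2.5 Flash (Google AI)", "")
#   llm.invoke([HumanMessage(content="Hello")])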
def generate_html(model_name, api_key, text_input, url_input, file_input):
"""Generate educational HTML file"""
start_time = time.time()
    # Models without a bundled default key require a user-supplied API key
    if not MODELS[model_name]["default_api"] and not api_key.strip():
return None, "❌ Error: Please provide an API key for this model.", 0
document_content = get_document_content(text_input, url_input, file_input)
if not document_content:
return None, "❌ Error: Please provide a document (text, URL or file).", 0
try:
# Create LLM instance
llm = create_llm_instance(model_name, api_key)
# Read prompt template
with open("creation_educational_html_from_any_document_18082025.txt", "r", encoding="utf-8") as f:
prompt_template = f.read()
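        # The template file is expected to sit next to app.py and must contain the
        # placeholders {model_name}, {provider_name} and {document}, filled in below via str.format.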
# Replace variables
model_config = MODELS[model_name]
prompt = prompt_template.format(
model_name=model_config["model_name"],
provider_name=model_config["provider"],
document=document_content
)
# Generate content
message = HumanMessage(content=prompt)
response = llm.invoke([message])
html_content = response.content
# Clean any code tags from models
html_content = html_content.replace("```html", "")
html_content = html_content.replace("```", "")
# Calculate generation time
generation_time = time.time() - start_time
# Save HTML file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"educational_document_{timestamp}.html"
with open(filename, "w", encoding="utf-8") as f:
f.write(html_content)
success_message = f"✅ HTML file generated successfully in {generation_time:.2f} seconds!"
return filename, success_message, generation_time
except Exception as e:
error_message = f"❌ Error during generation: {str(e)}"
return None, error_message, 0
def reset_form():
"""Reset the form to zero"""
return (
"Gemini 2.5 Flash (Google AI)", # model_name
"", # api_key
"", # text_input
"", # url_input
None, # file_input
"", # status_message
None, # html_file
"" # html_preview
)
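# Note: the tuple order above must match the outputs list of reset_btn.click below
# (model_dropdown, api_input, text_input, url_input, file_input, status_output,
#  html_file_output, html_preview).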
def update_api_info(model_name):
"""Update API information based on selected model"""
if model_name == "Gemini 2.5 Flash (Google AI)":
return gr.update(
label="API Key (optional)",
placeholder="Free API available until exhausted, or use your own key",
info="💡 A free API is already configured for this model. You can use your own key if you wish."
)
else:
return gr.update(
label="API Key (required)",
placeholder="Enter your API key",
info="🔑 API key required for this model"
)
# Gradio Interface (Apple-like)
with gr.Blocks(
title="EduHTML Creator - Educational HTML Content Generator",
theme=gr.themes.Soft(),
css="style.css",
js="script.js"
) as app:
# Header hero (black, full-width look within container)
gr.HTML("""
<div class="header" role="banner">
<div class="header-inner">
<h1>🎓 EduHTML Creator</h1>
<p>
Transform any document into interactive educational HTML content, with a premium Apple-inspired design.
Document fidelity, clear structure, interactivity, and highlighting of key information.
</p>
</div>
</div>
""")
with gr.Column(elem_classes=["main-container"]):
# Model Configuration Section
gr.HTML("<div class='section'>")
model_dropdown = gr.Dropdown(
choices=list(MODELS.keys()),
value="Gemini 2.5 Flash (Google AI)",
label="LLM Model",
info="Select the model to use for generation"
)
api_input = gr.Textbox(
label="API Key (optional)",
placeholder="Free API (Gemini Flash) available. You can enter your own key.",
info="For OpenAI/Anthropic, a key is required.",
type="password"
)
gr.HTML("</div>")
# Document Source Section with tabs
gr.HTML("<div class='section alt'>")
gr.HTML("<h3>Document Source</h3>")
with gr.Tabs():
with gr.TabItem("📝 Text"):
text_input = gr.Textbox(
label="Copied/pasted text",
placeholder="Paste your text here...",
lines=4
)
with gr.TabItem("🌐 URL"):
url_input = gr.Textbox(
label="Web Link",
placeholder="https://example.com/article"
)
with gr.TabItem("📁 File"):
file_input = gr.File(
label="File",
file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".pptx"]
)
gr.HTML("</div>")
# Action buttons
with gr.Row():
submit_btn = gr.Button("Generate HTML", variant="primary", elem_classes=["apple-button"])
reset_btn = gr.Button("Reset", elem_classes=["reset-button"])
# Results Section
status_output = gr.HTML(label="Status")
gr.HTML("<div class='section preview-card'>")
gr.HTML("<div class='preview-header'><div class='preview-dot' aria-hidden='true'></div><div>Preview</div></div>")
html_preview = gr.HTML(label="Preview", visible=False, elem_id="html-preview", elem_classes=["preview-body"])
html_file_output = gr.File(label="Downloadable HTML file", visible=False)
gr.HTML("</div>")
# Footer (black)
gr.HTML("""
<div class="footer" role="contentinfo">
<div class="footer-inner">
<span>Apple-inspired design • High contrasts • Smooth interactions</span>
</div>
</div>
""")
# Events
model_dropdown.change(
fn=update_api_info,
inputs=[model_dropdown],
outputs=[api_input]
)
    # The generation time is carried between the two steps via a shared State
    generation_time_state = gr.State()
    submit_btn.click(
        fn=generate_html,
        inputs=[model_dropdown, api_input, text_input, url_input, file_input],
        outputs=[html_file_output, status_output, generation_time_state]
    ).then(
        fn=lambda file, status, _: (
            gr.update(visible=file is not None),
            status,
            gr.update(visible=file is not None, value=(open(file, 'r', encoding='utf-8').read() if file else ""))
        ),
        inputs=[html_file_output, status_output, generation_time_state],
        outputs=[html_file_output, status_output, html_preview]
    )
reset_btn.click(
fn=reset_form,
outputs=[model_dropdown, api_input, text_input, url_input, file_input, status_output, html_file_output, html_preview]
)
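# Note: on Hugging Face Spaces the platform exposes the app on port 7860 directly,
# so share=True is typically unnecessary there; it only matters for local runs.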
if __name__ == "__main__":
app.launch(
server_name="0.0.0.0",
server_port=7860,
share=True
)