Spaces:

pierreguillou
/

eduhtml-creator-apple-style

Sleeping

File size: 13,974 Bytes

import gradio as gr
import os
import time
import requests
from datetime import datetime
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from langchain_core.caches import BaseCache
from langchain_core.callbacks import Callbacks
ChatGoogleGenerativeAI.model_rebuild()
import pandas as pd
import io
import tempfile
from urllib.parse import urlparse
import re

# Import DocLing and necessary configuration classes
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

# Re-import ChatGoogleGenerativeAI together with BaseCache and call
# model_rebuild() on the class, tolerating any failure.
# NOTE(review): the same import and model_rebuild() call already run
# unguarded at the top of this file, so this guarded copy looks redundant.
# Presumably it works around unresolved forward references (BaseCache) in
# the model class - confirm which of the two copies is actually needed.
try:
    from langchain_google_genai import ChatGoogleGenerativeAI
    from langchain_core.caches import BaseCache
    ChatGoogleGenerativeAI.model_rebuild()
except Exception as e:
    # Best-effort: report the failure and still make sure the class name is
    # bound so the MODELS table below can reference it.
    print(f"Warning during rebuild: {e}")
    from langchain_google_genai import ChatGoogleGenerativeAI

# --- START OF OCR CONFIGURATION ---
# Build one module-level DocumentConverter at import time; it is reused by
# extract_text_from_file() instead of constructing a converter per call.

# 1. Define the pipeline options intended to enable OCR for PDFs.
# The intent is a single global DocLing converter with Tesseract OCR and a
# wide language list.
# Note: the language list assumes the tesseract-ocr-all package is installed
# so that the corresponding language data files exist - TODO confirm in the
# deployment image.
# NOTE(review): verify that `ocr_model` and `ocr_languages` are fields that
# PdfPipelineOptions actually accepts. If the model ignores unknown keyword
# arguments, these two settings would have no effect and the default OCR
# engine would be used instead. Also check every language code below against
# the installed Tesseract language packs.
pdf_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_model="tesseract",
    # Provide a broad default set. With tesseract-ocr-all, many language packs exist.
    # You can keep this small for speed or expand it. Here we include a practical wide set.
    ocr_languages=[
        "eng","fra","deu","spa","ita","por","nld","pol","tur","ces","rus","ukr","ell","ron","hun",
        "bul","hrv","srp","slk","slv","lit","lav","est","cat","eus","glg","isl","dan","nor","swe",
        "fin","alb","mlt","afr","zul","swa","amh","uzb","aze","kaz","kir","mon","tgl","ind","msa",
        "tha","vie","khm","lao","mya","ben","hin","mar","guj","pan","mal","tam","tel","kan","nep",
        "sin","urd","fas","pus","kur","aze_cyrl","tat","uig","heb","ara","yid","grc","chr","epo",
        "hye","kat","kat_old","aze_latn","mkd","bel","srp_latn","srp_cyrillic",
        # CJK — these are heavier and slower; include only if needed:
        "chi_sim","chi_tra","jpn","kor"
    ]
)

# 2. Create the format-specific configuration: the options above apply to
# PDF input only; other formats use the converter's defaults.
format_options = {
    InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
}

# 3. Initialize the shared converter with the PDF configuration.
docling_converter = DocumentConverter(format_options=format_options)
# --- END OF OCR CONFIGURATION ---

# Model configuration.
# Maps the label shown in the UI dropdown to:
#   provider    - provider name, used by create_llm_instance() to pick the
#                 constructor keyword for the API key
#   class       - LangChain chat-model class to instantiate
#   model_name  - model identifier passed to that class
#   default_api - flag marking the entry that has a built-in key
# NOTE(review): the UI labels and the model_name values do not match
# ("Gemini 2.5 Flash" -> gemini-2.0-flash-exp, "ChatGPT 5" -> gpt-4o,
# "Claude Sonnet 4" -> claude-3-5-sonnet-20241022), and the "Gemini 2.5 Pro"
# entry uses the same model_name as the Flash entry. Confirm which side is
# intended before changing either: the labels are also compared as literal
# strings in generate_html(), reset_form() and update_api_info().
MODELS = {
    "Gemini 2.5 Flash (Google AI)": {
        "provider": "Google AI",
        "class": ChatGoogleGenerativeAI,
        "model_name": "gemini-2.0-flash-exp",
        "default_api": True
    },
    "ChatGPT 5 (OpenAI)": {
        "provider": "OpenAI",
        "class": ChatOpenAI,
        "model_name": "gpt-4o",
        "default_api": False
    },
    "Claude Sonnet 4 (Anthropic)": {
        "provider": "Anthropic",
        "class": ChatAnthropic,
        "model_name": "claude-3-5-sonnet-20241022",
        "default_api": False
    },
    "Gemini 2.5 Pro (Google AI)": {
        "provider": "Google AI",
        "class": ChatGoogleGenerativeAI,
        "model_name": "gemini-2.0-flash-exp",
        "default_api": False
    }
}

# Default Google API key, read from the FLASH_GOOGLE_API_KEY environment
# variable (set via HF Spaces Secrets). It is None when the variable is
# unset; create_llm_instance() falls back to it for "Google AI" models when
# the caller supplies no key.
DEFAULT_GEMINI_API = os.getenv("FLASH_GOOGLE_API_KEY")

def extract_text_from_file(file):
    """
    Extract text from an uploaded file or a file path.

    Supported inputs, selected by (case-insensitive) file extension:
    - .pdf, .docx, .xlsx, .pptx: converted with the shared DocLing converter
      and exported as Markdown.
    - .csv, .xls: loaded with pandas, written to a temporary .xlsx file, then
      converted with DocLing like the formats above.
    - .txt: read directly as UTF-8 (a leading byte-order mark is dropped).

    Args:
        file: An object exposing a ``.name`` attribute that holds a filesystem
            path (e.g. a Gradio upload), or the path itself. ``None`` means
            that no file was provided.

    Returns:
        The extracted text; ``""`` when ``file`` is ``None``;
        ``"Unsupported file format"`` for any other extension; or a string
        starting with ``"Error reading file: "`` when reading or conversion
        fails. Failures are reported in the return value, never raised.
    """
    if file is None:
        return ""

    # Normalize to a filesystem path string
    path = file.name if hasattr(file, "name") else str(file)
    ext = os.path.splitext(path)[1].lower()

    docling_direct = {".pdf", ".docx", ".xlsx", ".pptx"}
    to_xlsx_first = {".csv", ".xls"}

    try:
        if ext in docling_direct:
            result = docling_converter.convert(path)
            return result.document.export_to_markdown()

        if ext in to_xlsx_first:
            # Convert CSV/XLS -> XLSX
            df = pd.read_csv(path) if ext == ".csv" else pd.read_excel(path)

            # Write into a private temporary directory rather than re-opening
            # a still-open NamedTemporaryFile by name, which is not portable
            # across platforms. The directory is removed on exit.
            with tempfile.TemporaryDirectory() as tmp_dir:
                xlsx_path = os.path.join(tmp_dir, "converted.xlsx")
                df.to_excel(xlsx_path, index=False)
                result = docling_converter.convert(xlsx_path)
                return result.document.export_to_markdown()

        if ext == ".txt":
            # "utf-8-sig" decodes plain UTF-8 and also strips a leading BOM,
            # which would otherwise end up as "\ufeff" in the text.
            with open(path, "r", encoding="utf-8-sig") as f:
                return f.read()

        return "Unsupported file format"
    except Exception as e:
        # Deliberate best-effort boundary: callers expect a string, not a raise.
        return f"Error reading file: {str(e)}"

def extract_text_from_url(url):
    """
    Download a web page and reduce it to plain text.

    The page is fetched with a 10 second timeout. HTML comments and the
    contents of <script>/<style> elements are removed, remaining tags are
    replaced by spaces (so adjacent words are not glued together), HTML
    entities are decoded, and whitespace runs are collapsed.

    Args:
        url: The address to fetch.

    Returns:
        At most the first 10,000 characters of the extracted text, or a
        string starting with ``"Error retrieving URL: "`` when the request or
        processing fails. Failures are reported in the return value, never
        raised.
    """
    # Local import so this block does not depend on a new file-level import.
    from html import unescape

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        content = response.text
        # Remove non-visible content first; a plain tag strip would leave
        # JavaScript and CSS source inside the extracted text.
        content = re.sub(r'(?is)<!--.*?-->', ' ', content)
        content = re.sub(r'(?is)<(script|style)\b[^>]*>.*?</\1\s*>', ' ', content)
        content = re.sub(r'<[^>]+>', ' ', content)
        # Decode entities only after tags are gone so that escaped markup
        # such as "&lt;b&gt;" survives as literal text.
        content = unescape(content)
        content = re.sub(r'\s+', ' ', content).strip()
        return content[:10000]  # Limit to 10k characters
    except Exception as e:
        # Deliberate best-effort boundary: callers expect a string, not a raise.
        return f"Error retrieving URL: {str(e)}"

def get_document_content(text_input, url_input, file_input):
    """
    Return the document text from the first source that is provided.

    Sources are tried in priority order: pasted text, then URL, then file.

    Args:
        text_input: Pasted text; ``None`` or blank means "not provided".
        url_input: URL to fetch; ``None`` or blank means "not provided".
        file_input: Uploaded file object or path, or ``None``.

    Returns:
        The stripped pasted text, or the result of extract_text_from_url() /
        extract_text_from_file() (which may be an error string produced by
        those functions), or ``""`` when no source is provided.
    """
    # Treat None like an empty string so a missing field cannot crash .strip().
    text = (text_input or "").strip()
    if text:
        return text

    url = (url_input or "").strip()
    if url:
        return extract_text_from_url(url)

    if file_input is not None:
        return extract_text_from_file(file_input)

    return ""

def create_llm_instance(model_name, api_key):
    """
    Create the chat-model instance for an entry of MODELS.

    Args:
        model_name: A key of MODELS (the label shown in the UI).
        api_key: API key supplied by the user. Surrounding whitespace is
            ignored. For "Google AI" models an empty key falls back to
            DEFAULT_GEMINI_API.

    Returns:
        An instance of the configured model class, created with
        ``temperature=0.7``.

    Raises:
        KeyError: If ``model_name`` is not a key of MODELS.
        ValueError: If the entry names a provider this function does not
            know how to configure (previously this case returned ``None``).
    """
    model_config = MODELS[model_name]
    provider = model_config["provider"]

    # Name of the constructor keyword that carries the API key, per provider.
    key_param_by_provider = {
        "OpenAI": "api_key",
        "Anthropic": "api_key",
        "Google AI": "google_api_key",
    }
    if provider not in key_param_by_provider:
        raise ValueError(f"Unsupported provider: {provider}")

    # A key made only of whitespace must count as "no key"; otherwise it
    # would suppress the default-key fallback below and fail at request time.
    key = api_key.strip() if isinstance(api_key, str) else api_key
    if provider == "Google AI" and not key:
        key = DEFAULT_GEMINI_API

    return model_config["class"](
        model=model_config["model_name"],
        temperature=0.7,
        **{key_param_by_provider[provider]: key}
    )

def generate_html(model_name, api_key, text_input, url_input, file_input):
    """
    Generate an educational HTML file from the provided document.

    Args:
        model_name: Label of the model to use (a key of MODELS).
        api_key: User-supplied API key; may be empty or ``None`` for the
            model whose MODELS entry has ``default_api`` set.
        text_input: Pasted document text (highest priority source).
        url_input: URL of the document (used when no text is pasted).
        file_input: Uploaded file (used when neither text nor URL is given).

    Returns:
        A tuple ``(file_path, status_message, generation_time)``. On success
        ``file_path`` is the name of the HTML file written to the current
        working directory and ``generation_time`` is the elapsed time in
        seconds. On any failure the tuple is ``(None, <error message>, 0)``;
        errors are reported through the message, never raised.
    """
    start_time = time.time()

    # Normalize the key once: None and whitespace-only both mean "no key".
    api_key = (api_key or "").strip()

    # Use the table's flag rather than a hard-coded label to decide whether
    # a user key is mandatory. An unknown model keeps the original behavior:
    # a key is required, and the lookup failure is reported further below.
    has_default_key = MODELS.get(model_name, {}).get("default_api", False)
    if not has_default_key and not api_key:
        return None, "❌ Error: Please provide an API key for this model.", 0

    pasted_text = (text_input or "").strip()
    document_content = get_document_content(pasted_text, url_input or "", file_input)
    if not document_content:
        return None, "❌ Error: Please provide a document (text, URL or file).", 0

    # The URL/file extractors report failures as ordinary strings. Without
    # this check such a message would be sent to the model as the document.
    # Pasted text is exempt: it is user content even if it starts this way.
    extraction_error_prefixes = (
        "Error reading file: ",
        "Unsupported file format",
        "Error retrieving URL: ",
    )
    if not pasted_text and document_content.startswith(extraction_error_prefixes):
        return None, f"❌ Error: {document_content}", 0

    try:
        # Create LLM instance
        llm = create_llm_instance(model_name, api_key)

        # Read prompt template
        with open("creation_educational_html_from_any_document_18082025.txt", "r", encoding="utf-8") as f:
            prompt_template = f.read()

        # Replace variables
        model_config = MODELS[model_name]
        prompt = prompt_template.format(
            model_name=model_config["model_name"],
            provider_name=model_config["provider"],
            document=document_content
        )

        # Generate content
        message = HumanMessage(content=prompt)
        response = llm.invoke([message])
        html_content = response.content

        # Message content may arrive as a list of parts instead of one
        # string; keep the textual parts and concatenate them in order.
        if isinstance(html_content, list):
            html_content = "".join(
                part if isinstance(part, str) else str(part.get("text", ""))
                for part in html_content
                if isinstance(part, (str, dict))
            )

        # Clean any code tags from models
        html_content = html_content.replace("```html", "")
        html_content = html_content.replace("```", "")

        # Calculate generation time
        generation_time = time.time() - start_time

        # Save HTML file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"educational_document_{timestamp}.html"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_content)

        success_message = f"✅ HTML file generated successfully in {generation_time:.2f} seconds!"
        return filename, success_message, generation_time

    except Exception as e:
        # Top-level UI boundary: surface the failure as a status message.
        error_message = f"❌ Error during generation: {str(e)}"
        return None, error_message, 0

def reset_form():
    """
    Return the initial value of every form component.

    Returns:
        An 8-tuple, in the order expected by the reset button's outputs:
        model name, API key, pasted text, URL, uploaded file, status
        message, generated HTML file, HTML preview.
    """
    default_model = "Gemini 2.5 Flash (Google AI)"
    empty_text = ""
    no_file = None

    api_key = text_input = url_input = empty_text
    status_message = html_preview = empty_text

    return (
        default_model,
        api_key,
        text_input,
        url_input,
        no_file,
        status_message,
        no_file,
        html_preview,
    )

def update_api_info(model_name):
    """
    Build the API-key textbox update that matches the selected model.

    Args:
        model_name: Label currently selected in the model dropdown.

    Returns:
        A ``gr.update(...)`` setting the textbox label, placeholder and info
        text: "optional" wording for "Gemini 2.5 Flash (Google AI)",
        "required" wording for every other model.
    """
    key_is_optional = model_name == "Gemini 2.5 Flash (Google AI)"

    if not key_is_optional:
        textbox_props = {
            "label": "API Key (required)",
            "placeholder": "Enter your API key",
            "info": "🔑 API key required for this model",
        }
    else:
        textbox_props = {
            "label": "API Key (optional)",
            "placeholder": "Free API available until exhausted, or use your own key",
            "info": "💡 A free API is already configured for this model. You can use your own key if you wish.",
        }

    return gr.update(**textbox_props)

# Gradio Interface (Apple-like)
def _show_generation_result(file, status, _generation_time):
    """
    Reveal the download and preview components after a generation attempt.

    Args:
        file: Path of the generated HTML file, or ``None`` when generation
            failed.
        status: Status message to display unchanged.
        _generation_time: Unused; present because the event passes it along.

    Returns:
        Updates for (download file, status, preview). The download and the
        preview are visible only when a file exists, and the preview shows
        the content of that file.
    """
    has_file = file is not None
    preview_html = ""
    if file:
        # Read through a context manager so the handle is closed right away
        # instead of being left to the garbage collector.
        with open(file, 'r', encoding='utf-8') as generated:
            preview_html = generated.read()
    return (
        gr.update(visible=has_file),
        status,
        gr.update(visible=has_file, value=preview_html),
    )


# Gradio Interface (Apple-like)
with gr.Blocks(
    title="EduHTML Creator - Educational HTML Content Generator",
    theme=gr.themes.Soft(),
    css="style.css",
    js="script.js"
) as app:

    # Header hero (black, full-width look within container)
    gr.HTML("""
    <div class="header" role="banner">
        <div class="header-inner">
            <h1>🎓 EduHTML Creator</h1>
            <p>
                Transform any document into interactive educational HTML content, with a premium Apple-inspired design.
                Document fidelity, clear structure, interactivity, and highlighting of key information.
            </p>
        </div>
    </div>
    """)

    with gr.Column(elem_classes=["main-container"]):
        # Model Configuration Section
        gr.HTML("<div class='section'>")
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()),
            value="Gemini 2.5 Flash (Google AI)",
            label="LLM Model",
            info="Select the model to use for generation"
        )

        api_input = gr.Textbox(
            label="API Key (optional)",
            placeholder="Free API (Gemini Flash) available. You can enter your own key.",
            info="For OpenAI/Anthropic, a key is required.",
            type="password"
        )
        gr.HTML("</div>")

        # Document Source Section with tabs
        gr.HTML("<div class='section alt'>")
        gr.HTML("<h3>Document Source</h3>")

        with gr.Tabs():
            with gr.TabItem("📝 Text"):
                text_input = gr.Textbox(
                    label="Copied/pasted text",
                    placeholder="Paste your text here...",
                    lines=4
                )

            with gr.TabItem("🌐 URL"):
                url_input = gr.Textbox(
                    label="Web Link",
                    placeholder="https://example.com/article"
                )

            with gr.TabItem("📁 File"):
                # ".csv" is listed because extract_text_from_file() handles
                # it; without it the upload widget rejects CSV files.
                file_input = gr.File(
                    label="File",
                    file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx"]
                )

        gr.HTML("</div>")

        # Action buttons
        with gr.Row():
            submit_btn = gr.Button("Generate HTML", variant="primary", elem_classes=["apple-button"])
            reset_btn = gr.Button("Reset", elem_classes=["reset-button"])

        # Results Section
        status_output = gr.HTML(label="Status")
        gr.HTML("<div class='section preview-card'>")
        gr.HTML("<div class='preview-header'><div class='preview-dot' aria-hidden='true'></div><div>Preview</div></div>")
        html_preview = gr.HTML(label="Preview", visible=False, elem_id="html-preview", elem_classes=["preview-body"])
        html_file_output = gr.File(label="Downloadable HTML file", visible=False)
        gr.HTML("</div>")

        # Footer (black)
        gr.HTML("""
        <div class="footer" role="contentinfo">
            <div class="footer-inner">
                <span>Apple-inspired design • High contrasts • Smooth interactions</span>
            </div>
        </div>
        """)

        # Receives the third value returned by generate_html (the generation
        # time). One named state replaces two anonymous gr.State() instances.
        generation_time_state = gr.State()

        # Events
        model_dropdown.change(
            fn=update_api_info,
            inputs=[model_dropdown],
            outputs=[api_input]
        )

        # Step 1 generates the file; step 2 toggles visibility of the
        # download/preview components and fills the preview.
        submit_btn.click(
            fn=generate_html,
            inputs=[model_dropdown, api_input, text_input, url_input, file_input],
            outputs=[html_file_output, status_output, generation_time_state]
        ).then(
            fn=_show_generation_result,
            inputs=[html_file_output, status_output, generation_time_state],
            outputs=[html_file_output, status_output, html_preview]
        )

        reset_btn.click(
            fn=reset_form,
            outputs=[model_dropdown, api_input, text_input, url_input, file_input, status_output, html_file_output, html_preview]
        )

if __name__ == "__main__":
    # Start the Gradio server when this file is run as a script: listen on
    # all interfaces, port 7860, with a public share link requested.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    app.launch(**launch_options)