import gradio as gr
import os
import time
import requests
from datetime import datetime
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from langchain_core.caches import BaseCache
from langchain_core.callbacks import Callbacks
import pandas as pd
import io
import tempfile
from urllib.parse import urlparse
import re

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractOcrOptions
from docling.datamodel.base_models import InputFormat

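# langchain-google-genai sometimes needs an explicit Pydantic rebuild so that
# forward references (BaseCache and Callbacks, imported above) are resolved
# before the chat model class is instantiated.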
try:
    ChatGoogleGenerativeAI.model_rebuild()
except Exception as e:
    print(f"Warning during rebuild: {e}")

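# Docling PDF pipeline: OCR is enabled with Tesseract and a broad list of
# language codes so scanned documents in many scripts can still be extracted.
# Each code requires the matching Tesseract traineddata pack at runtime.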
pdf_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=TesseractOcrOptions(
        lang=[
            "eng","fra","deu","spa","ita","por","nld","pol","tur","ces","rus","ukr","ell","ron","hun",
            "bul","hrv","srp","slk","slv","lit","lav","est","cat","eus","glg","isl","dan","nor","swe",
            "fin","alb","mlt","afr","zul","swa","amh","uzb","aze","kaz","kir","mon","tgl","ind","msa",
            "tha","vie","khm","lao","mya","ben","hin","mar","guj","pan","mal","tam","tel","kan","nep",
            "sin","urd","fas","pus","kur","aze_cyrl","tat","uig","heb","ara","yid","grc","chr","epo",
            "hye","kat","kat_old","aze_latn","mkd","bel","srp_latn","srp_cyrillic",
            "chi_sim","chi_tra","jpn","kor"
        ]
    )
)

format_options = {
    InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
}

docling_converter = DocumentConverter(format_options=format_options)
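
# Illustrative converter usage (the file name is a placeholder):
#   result = docling_converter.convert("lesson.pdf")
#   markdown = result.document.export_to_markdown()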
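
# Registry of selectable models: UI label -> provider name, LangChain chat
# class, backend model identifier, and whether the app's default API key may
# be used (only for the default Gemini Flash entry).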
MODELS = {
    "Gemini 2.5 Flash (Google AI)": {
        "provider": "Google AI",
        "class": ChatGoogleGenerativeAI,
        "model_name": "gemini-2.0-flash-exp",
        "default_api": True
    },
    "ChatGPT 5 (OpenAI)": {
        "provider": "OpenAI",
        "class": ChatOpenAI,
        "model_name": "gpt-4o",
        "default_api": False
    },
    "Claude Sonnet 4 (Anthropic)": {
        "provider": "Anthropic",
        "class": ChatAnthropic,
        "model_name": "claude-3-5-sonnet-20241022",
        "default_api": False
    },
    "Gemini 2.5 Pro (Google AI)": {
        "provider": "Google AI",
        "class": ChatGoogleGenerativeAI,
        "model_name": "gemini-2.0-flash-exp",
        "default_api": False
    }
}
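
# Shared Gemini API key read from the environment; used for Google AI models
# whenever the user leaves the API key field empty.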
DEFAULT_GEMINI_API = os.getenv("FLASH_GOOGLE_API_KEY")

def extract_text_from_file(file):
    """
    Extract text from an uploaded file or a path (str).
    - Accepts an object with a .name attribute (e.g. a Gradio upload) OR a file path (str).
    - Docling handles .pdf (with Tesseract OCR if configured), .docx, .xlsx and .pptx directly.
    - .csv / .xls are converted to a temporary .xlsx first, then passed to Docling.
    - .txt files are read directly.
    """
    if file is None:
        return ""

    path = file.name if hasattr(file, "name") else str(file)
    ext = os.path.splitext(path)[1].lower()

    docling_direct = {".pdf", ".docx", ".xlsx", ".pptx"}
    to_xlsx_first = {".csv", ".xls"}

    try:
        if ext in docling_direct:
            result = docling_converter.convert(path)
            return result.document.export_to_markdown()

        elif ext in to_xlsx_first:
            if ext == ".csv":
                df = pd.read_csv(path)
            else:
                df = pd.read_excel(path)

            # Write to a temporary .xlsx, convert it with Docling, then clean up.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
                tmp_path = tmp.name
            try:
                df.to_excel(tmp_path, index=False)
                result = docling_converter.convert(tmp_path)
                return result.document.export_to_markdown()
            finally:
                os.unlink(tmp_path)

        elif ext == ".txt":
            with open(path, "r", encoding="utf-8") as f:
                return f.read()

        else:
            return "Unsupported file format"
    except Exception as e:
        return f"Error reading file: {str(e)}"

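# Naive URL fetch: strips HTML tags with a regex (no full parser) and caps the
# result at 10,000 characters.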
def extract_text_from_url(url):
    """Extract text from a URL"""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        content = response.text
        content = re.sub(r'<[^>]+>', '', content)
        content = re.sub(r'\s+', ' ', content).strip()
        return content[:10000]
    except Exception as e:
        return f"Error retrieving URL: {str(e)}"

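# Source precedence: pasted text first, then URL, then uploaded file.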
def get_document_content(text_input, url_input, file_input):
    """Retrieve document content based on source"""
    if text_input.strip():
        return text_input.strip()
    elif url_input.strip():
        return extract_text_from_url(url_input.strip())
    elif file_input is not None:
        return extract_text_from_file(file_input)
    else:
        return ""

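# Google AI requests fall back to the shared DEFAULT_GEMINI_API key when the
# user has not supplied their own; OpenAI and Anthropic always use the user's key.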
def create_llm_instance(model_name, api_key):
    """Create an LLM model instance"""
    model_config = MODELS[model_name]
    if model_config["provider"] == "OpenAI":
        return model_config["class"](
            model=model_config["model_name"],
            api_key=api_key,
            temperature=0.7
        )
    elif model_config["provider"] == "Anthropic":
        return model_config["class"](
            model=model_config["model_name"],
            api_key=api_key,
            temperature=0.7
        )
    elif model_config["provider"] == "Google AI":
        api_to_use = api_key if api_key else DEFAULT_GEMINI_API
        return model_config["class"](
            model=model_config["model_name"],
            google_api_key=api_to_use,
            temperature=0.7
        )

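# End-to-end generation: validate the API key, gather the document content,
# fill the prompt template, call the model, strip stray Markdown fences, and
# write the result to a timestamped .html file.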
def generate_html(model_name, api_key, text_input, url_input, file_input):
    """Generate the educational HTML file"""
    start_time = time.time()
    if model_name != "Gemini 2.5 Flash (Google AI)" and not api_key.strip():
        return None, "❌ Error: Please provide an API key for this model.", 0

    document_content = get_document_content(text_input, url_input, file_input)
    if not document_content:
        return None, "❌ Error: Please provide a document (text, URL or file).", 0

    try:
        llm = create_llm_instance(model_name, api_key)

        with open("creation_educational_html_from_any_document_18082025.txt", "r", encoding="utf-8") as f:
            prompt_template = f.read()

        model_config = MODELS[model_name]
        prompt = prompt_template.format(
            model_name=model_config["model_name"],
            provider_name=model_config["provider"],
            document=document_content
        )

        message = HumanMessage(content=prompt)
        response = llm.invoke([message])
        html_content = response.content

        # Strip Markdown code fences the model may wrap around the HTML.
        html_content = html_content.replace("```html", "")
        html_content = html_content.replace("```", "")

        generation_time = time.time() - start_time

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"educational_document_{timestamp}.html"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_content)

        success_message = f"✅ HTML file generated successfully in {generation_time:.2f} seconds!"
        return filename, success_message, generation_time

    except Exception as e:
        error_message = f"❌ Error during generation: {str(e)}"
        return None, error_message, 0

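# Values returned in the same order as the components wired to reset_btn.click:
# model dropdown, API key, text, URL, file, status, downloadable file, preview.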
def reset_form():
    """Reset the form to its default state"""
    return (
        "Gemini 2.5 Flash (Google AI)",
        "",
        "",
        "",
        None,
        "",
        None,
        ""
    )

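# Adjust the API key field's label, placeholder and help text when the model changes.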
def update_api_info(model_name):
    """Update the API key field based on the selected model"""
    if model_name == "Gemini 2.5 Flash (Google AI)":
        return gr.update(
            label="API Key (optional)",
            placeholder="Free API key available (until its quota is exhausted), or use your own key",
            info="💡 A free API key is already configured for this model. You can use your own key if you wish."
        )
    else:
        return gr.update(
            label="API Key (required)",
            placeholder="Enter your API key",
            info="🔑 API key required for this model"
        )

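# --- Gradio UI ---
# The layout relies on external style.css / script.js assets and wraps sections
# in raw HTML snippets for the Apple-inspired styling.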
with gr.Blocks(
    title="EduHTML Creator - Educational HTML Content Generator",
    theme=gr.themes.Soft(),
    css="style.css",
    js="script.js"
) as app:

    gr.HTML("""
    <div class="header" role="banner">
        <div class="header-inner">
            <h1>🎓 EduHTML Creator</h1>
            <p>
                Transform any document into interactive educational HTML content, with a premium Apple-inspired design.
                Document fidelity, clear structure, interactivity, and highlighting of key information.
            </p>
        </div>
    </div>
    """)

    with gr.Column(elem_classes=["main-container"]):

        gr.HTML("<div class='section'>")
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()),
            value="Gemini 2.5 Flash (Google AI)",
            label="LLM Model",
            info="Select the model to use for generation"
        )

        api_input = gr.Textbox(
            label="API Key (optional)",
            placeholder="Free API (Gemini Flash) available. You can enter your own key.",
            info="For OpenAI/Anthropic, a key is required.",
            type="password"
        )
        gr.HTML("</div>")

        gr.HTML("<div class='section alt'>")
        gr.HTML("<h3>Document Source</h3>")

        with gr.Tabs():
            with gr.TabItem("📝 Text"):
                text_input = gr.Textbox(
                    label="Copied/pasted text",
                    placeholder="Paste your text here...",
                    lines=4
                )

            with gr.TabItem("🌐 URL"):
                url_input = gr.Textbox(
                    label="Web Link",
                    placeholder="https://example.com/article"
                )

            with gr.TabItem("📁 File"):
                file_input = gr.File(
                    label="File",
                    file_types=[".pdf", ".txt", ".docx", ".xlsx", ".xls", ".pptx"]
                )

        gr.HTML("</div>")

        with gr.Row():
            submit_btn = gr.Button("Generate HTML", variant="primary", elem_classes=["apple-button"])
            reset_btn = gr.Button("Reset", elem_classes=["reset-button"])

        status_output = gr.HTML(label="Status")
        gr.HTML("<div class='section preview-card'>")
        gr.HTML("<div class='preview-header'><div class='preview-dot' aria-hidden='true'></div><div>Preview</div></div>")
        html_preview = gr.HTML(label="Preview", visible=False, elem_id="html-preview", elem_classes=["preview-body"])
        html_file_output = gr.File(label="Downloadable HTML file", visible=False)
        gr.HTML("</div>")

    gr.HTML("""
    <div class="footer" role="contentinfo">
        <div class="footer-inner">
            <span>Apple-inspired design • High contrasts • Smooth interactions</span>
        </div>
    </div>
    """)

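    # Event wiring: update the API key field on model change, generate and
    # preview the HTML on submit, and restore defaults on reset.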
    model_dropdown.change(
        fn=update_api_info,
        inputs=[model_dropdown],
        outputs=[api_input]
    )

    # Shared state holding the generation time returned by generate_html.
    generation_time_state = gr.State()

    submit_btn.click(
        fn=generate_html,
        inputs=[model_dropdown, api_input, text_input, url_input, file_input],
        outputs=[html_file_output, status_output, generation_time_state]
    ).then(
        fn=lambda file, status, _: (
            gr.update(visible=file is not None),
            status,
            gr.update(visible=file is not None, value=(open(file, 'r', encoding='utf-8').read() if file else ""))
        ),
        inputs=[html_file_output, status_output, generation_time_state],
        outputs=[html_file_output, status_output, html_preview]
    )

    reset_btn.click(
        fn=reset_form,
        outputs=[model_dropdown, api_input, text_input, url_input, file_input, status_output, html_file_output, html_preview]
    )

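# Launch on all interfaces (port 7860, Gradio's default) with a public share link.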
if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )