# Hugging Face Space: CamiloVega/cuentos
# Space status when captured: Sleeping
# File size: 1,941 Bytes

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import fitz  # PyMuPDF
from docx import Document

# Load the tokenizer and the causal language model once, at import time, so
# every request served by the app reuses the same objects.
# NOTE(review): trust_remote_code=True permits running Python code shipped
# with the checkpoint repository -- confirm this is intended for this model.
# NOTE(review): weights are requested in float16 and no device is chosen
# here -- confirm half precision is supported where this app runs.
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: Either a binary file-like object (anything exposing ``read()``)
            or a filesystem path to the PDF. Accepting both mirrors
            ``extract_text_from_docx``, whose ``Document(file)`` call is given
            the same upload object.
            NOTE(review): which of the two the upload widget delivers
            presumably depends on the installed Gradio version -- confirm.

    Returns:
        The text of all pages joined in page order, with no separator added
        between pages.
    """
    if hasattr(file, "read"):
        # In-memory open: PyMuPDF needs the explicit type hint for streams.
        doc = fitz.open(stream=file.read(), filetype="pdf")
    else:
        # str() reduces str subclasses and path objects to a plain string.
        doc = fitz.open(str(file), filetype="pdf")

    # The context manager closes the document even if a page fails to parse.
    with doc:
        return "".join(page.get_text() for page in doc)

def extract_text_from_docx(file):
    """Return the text of a DOCX document, one line per paragraph.

    Args:
        file: Whatever ``docx.Document`` accepts as its source; it is passed
            through unchanged.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, joined
        with newline characters.
    """
    paragraphs = Document(file).paragraphs
    return "\n".join(para.text for para in paragraphs)

# Fixed parts of the prompt. The article text is inserted between them.
_PROMPT_HEAD = (
    "Convert the following news article into a short children's story "
    "(maximum 200 words):\n\n"
)
_PROMPT_TAIL = "\n\nChildren's story:"
# Upper bound on prompt length (in tokens) and on generated length.
_MAX_PROMPT_TOKENS = 1024
_MAX_STORY_TOKENS = 200


def convert_to_story(file):
    """Turn an uploaded PDF or DOCX news article into a short children's story.

    Args:
        file: The uploaded file, or ``None`` when nothing was uploaded. It may
            be an object with a ``name`` attribute holding the file path, or
            the path itself.

    Returns:
        The generated story, or a human-readable message when no file was
        given, the extension is not ``.pdf``/``.docx``, or the file contains
        no extractable text.
    """
    import os  # local import keeps this block self-contained

    if file is None:
        return "Please upload a file."

    # Fall back to the object itself when it has no ``name`` (plain path).
    filename = str(getattr(file, "name", file))
    # splitext yields "" for extension-less names, so a file literally called
    # "pdf" is not mistaken for a PDF.
    extension = os.path.splitext(filename)[1].lower()

    if extension == ".pdf":
        text = extract_text_from_pdf(file)
    elif extension == ".docx":
        text = extract_text_from_docx(file)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."

    text = text.strip()
    if not text:
        return "No text could be extracted from the uploaded file."

    # Encode as a (first, second) pair and truncate only the first segment:
    # a long article is shortened, but the trailing "Children's story:" cue
    # always survives, so the model is still asked for a story.
    inputs = tokenizer(
        _PROMPT_HEAD + text,
        _PROMPT_TAIL,
        return_tensors="pt",
        truncation="only_first",
        max_length=_MAX_PROMPT_TOKENS,
        return_token_type_ids=False,
    )
    # Keep the input tensors on the same device as the model weights.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=_MAX_STORY_TOKENS,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )

    # The returned sequence starts with the prompt tokens; decode only what
    # was generated after them instead of searching the text for a marker.
    prompt_length = inputs["input_ids"].shape[1]
    story = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return story.strip()

# Web front end: a single file-upload input wired to convert_to_story, with
# the returned string shown as plain text.
iface = gr.Interface(
    convert_to_story,
    gr.File(label="Upload PDF or DOCX file"),
    "text",
    title="News to Children's Story Converter",
    description="Upload a news article in PDF or DOCX format to convert it into a short children's story.",
)

# Start serving the interface as soon as this module is executed.
iface.launch()