Spaces:
Sleeping
Sleeping
import os

import fitz  # PyMuPDF
import gradio as gr
import torch
from docx import Document
from transformers import AutoTokenizer, AutoModelForCausalLM
# Checkpoint identifier shared by the tokenizer and the causal language model.
model_name = "microsoft/phi-2"

# Both objects are created once at import time and reused by every request.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# NOTE(review): weights are requested in float16 and this file never moves the
# model to another device -- confirm half precision is supported where it runs.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
def extract_text_from_pdf(file):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        file: Binary file-like object; its entire content is consumed with
            ``file.read()`` and parsed as a PDF.

    Returns:
        The text of all pages joined together with no extra separator.
    """
    # The context manager closes the PyMuPDF document even if a page fails to
    # extract, instead of leaving it open until garbage collection.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
def extract_text_from_docx(file):
    """Return the text of a DOCX document, one paragraph per line.

    Args:
        file: Whatever ``docx.Document`` is given to open the document.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, joined
        with newline characters.
    """
    document = Document(file)
    paragraph_texts = []
    for para in document.paragraphs:
        paragraph_texts.append(para.text)
    return "\n".join(paragraph_texts)
def convert_to_story(file):
    """Turn an uploaded news article (PDF or DOCX) into a short children's story.

    Args:
        file: The upload as handed over by the UI: ``None`` when nothing was
            uploaded, a filesystem path (``str`` / ``os.PathLike``), or a
            file-like object exposing a ``name`` attribute.

    Returns:
        The generated story, or a short user-facing message when no file was
        given, the format is unsupported, or no text could be extracted.
    """
    if file is None:
        return "Please upload a file."

    # The upload may arrive as a path or as a file object; resolve a file name
    # either way so the extension check does not crash on plain strings.
    is_path = isinstance(file, (str, os.PathLike))
    filename = os.fspath(file) if is_path else str(getattr(file, "name", ""))
    file_extension = os.path.splitext(filename)[1].lower()

    if file_extension == ".pdf":
        extract = extract_text_from_pdf
    elif file_extension == ".docx":
        extract = extract_text_from_docx
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."

    if is_path:
        # The extractors work on file objects, so open paths here.
        with open(filename, "rb") as handle:
            text = extract(handle)
    else:
        text = extract(file)

    if not text.strip():
        return "No readable text could be extracted from the uploaded file."

    instruction = (
        "Convert the following news article into a short children's story "
        "(maximum 200 words):\n\n"
    )
    cue = "\n\nChildren's story:"
    max_prompt_tokens = 1024

    # Trim the ARTICLE, not the whole prompt: truncating the full prompt from
    # the right would drop the trailing cue that tells the model to start the
    # story. A small margin absorbs token-boundary drift after re-decoding.
    frame_tokens = len(tokenizer(instruction + cue)["input_ids"])
    article_budget = max(max_prompt_tokens - frame_tokens - 8, 1)
    article_ids = tokenizer(text)["input_ids"]
    if len(article_ids) > article_budget:
        text = tokenizer.decode(
            article_ids[:article_budget], skip_special_tokens=True
        )

    prompt = f"{instruction}{text}{cue}"
    # Final truncation is kept only as a safety net for the length limit.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    )
    # Keep the input tensors on the same device as the model weights.
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on the cue breaks when the model repeats "Children's story:" itself.
    story = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )
    return story.strip()
# Gradio UI: one file-upload input wired to convert_to_story, with the
# returned string shown as plain text.
iface = gr.Interface(
    title="News to Children's Story Converter",
    description=(
        "Upload a news article in PDF or DOCX format "
        "to convert it into a short children's story."
    ),
    fn=convert_to_story,
    inputs=gr.File(label="Upload PDF or DOCX file"),
    outputs="text",
)

# Start serving the interface.
iface.launch()