Spaces:
Runtime error
Runtime error
File size: 3,164 Bytes
c2386f3 d15716c 6156752 226d8dd c2386f3 51afa48 f4aeea8 51afa48 6156752 51afa48 6156752 51afa48 c2386f3 51afa48 c2386f3 51afa48 c2386f3 51afa48 c2386f3 51afa48 6156752 2ef22e2 226d8dd 2ef22e2 226d8dd 2ef22e2 39c36c8 6156752 226d8dd 2ef22e2 226d8dd b423cba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import logging
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
from pdfminer.high_level import extract_text
# Configure root logging once at import time; all functions below share this logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# General-purpose text2text pipeline (base T5) — runs on CPU; used for the first
# generation pass over raw transcript text.
pipe = pipeline("text2text-generation", model="google-t5/t5-base", device="cpu")
# T5 has no dedicated pad token by default; reuse EOS so batched generation
# does not warn/fail on padding.
pipe.model.config.pad_token_id = pipe.tokenizer.eos_token_id
# Locally fine-tuned T5 checkpoint used for the second, refinement pass.
# NOTE(review): assumes ./fine_tuned_model exists relative to the CWD at import
# time — import will raise if it is missing; confirm deployment layout.
fine_tuned_model_path = "./fine_tuned_model"
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_path)
fine_tuned_tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)
def pdf_to_text(pdf_path):
    """Extract the plain text content of a PDF file.

    Args:
        pdf_path: Filesystem path to the PDF to read.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If pdfminer fails for any reason; the original
            exception is chained as ``__cause__``.
    """
    try:
        logger.info(f"Extracting text from PDF: {pdf_path}")
        return extract_text(pdf_path)
    except Exception as e:
        logger.error(f"Error while extracting text from PDF: {str(e)}")
        # Chain the cause so the pdfminer traceback is preserved for debugging.
        raise ValueError(f"PDF extraction error: {str(e)}") from e
def generate_lesson_from_transcript(doc_text):
    """Generate lesson text from a transcript using the general T5 pipeline.

    The generated text is also persisted to a temp file so callers can
    offer it as a download.

    Args:
        doc_text: Raw transcript text to generate a lesson from.

    Returns:
        A ``(generated_text, output_path)`` tuple on success, or
        ``("An error occurred", None)`` if generation fails (errors are
        logged, not raised — callers rely on this best-effort contract).
    """
    try:
        logger.info("Generating lesson from transcript using general model.")
        generated_text = pipe(doc_text, max_length=100, truncation=True)[0]['generated_text']
        output_path = "/tmp/generated_output.txt"
        # Explicit encoding: model output may contain non-ASCII characters and
        # the platform default encoding is not guaranteed to be UTF-8.
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(generated_text)
        logger.info(f"Lesson generation successful. Output saved at: {output_path}")
        return generated_text, output_path
    except Exception as e:
        logger.error(f"Error occurred during lesson generation: {str(e)}")
        return "An error occurred", None
def refine_with_fine_tuned_model(general_output):
    """Run a second, refinement pass over generated text with the fine-tuned T5.

    Args:
        general_output: Text produced by the general model.

    Returns:
        The refined text, or the string
        ``"An error occurred during refinement."`` if anything fails
        (the error is logged rather than raised).
    """
    try:
        logger.info("Refining the output with fine-tuned model.")
        # Instruction-style prompt steers the fine-tuned model toward teaching tone.
        refinement_prompt = "Refine the following text for teaching purposes: " + general_output
        encoded = fine_tuned_tokenizer(
            refinement_prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        generated_ids = fine_tuned_model.generate(
            encoded["input_ids"],
            max_length=300,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
        return fine_tuned_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    except Exception as e:
        logger.error(f"Error during refinement with fine-tuned model: {str(e)}")
        return "An error occurred during refinement."
def split_text_into_chunks(text, chunk_size=1000):
    """Split *text* into whitespace-tokenized chunks of at most *chunk_size* words.

    Args:
        text: The input text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each containing up to ``chunk_size`` words joined
        by single spaces. Empty input yields an empty list.
    """
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
def generate_lesson_from_chunks(chunks):
    """Generate lesson text for each chunk and join the results.

    Chunks that fail generation are skipped (best-effort), so the result
    may cover fewer chunks than were supplied.

    Args:
        chunks: Iterable of text chunks to run through the general pipeline.

    Returns:
        The generated texts for all successful chunks, joined by spaces.
    """
    generated_texts = []
    for chunk in chunks:
        try:
            generated_text = pipe(chunk, max_length=500, truncation=True)[0]['generated_text']
            generated_texts.append(generated_text)
        except Exception as e:
            # Use the module logger for consistency with the rest of the file
            # (previously a bare print, which bypassed the logging config).
            logger.error(f"Error in chunk processing: {str(e)}")
            continue
    return ' '.join(generated_texts)
def process_large_text(text):
    """Chunk *text* into 1000-word pieces and generate a lesson over all chunks.

    Args:
        text: Arbitrarily long input text.

    Returns:
        The joined generated output for every chunk.
    """
    return generate_lesson_from_chunks(split_text_into_chunks(text, chunk_size=1000))