import logging
from typing import List, Optional, Tuple

from pdfminer.high_level import extract_text
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# General-purpose text2text pipeline used for the first-pass lesson draft.
pipe = pipeline("text2text-generation", model="google-t5/t5-base", device="cpu")
# NOTE(review): T5 ships with its own pad token; overriding pad_token_id with
# the EOS id looks carried over from decoder-only examples -- confirm intended.
pipe.model.config.pad_token_id = pipe.tokenizer.eos_token_id

# Locally fine-tuned checkpoint used by refine_with_fine_tuned_model().
fine_tuned_model_path = "./fine_tuned_model"
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_path)
fine_tuned_tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)


def pdf_to_text(pdf_path) -> str:
    """Extract the text of a PDF file.

    Args:
        pdf_path: Location of the PDF, passed straight to pdfminer's
            ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for the document.

    Raises:
        ValueError: If extraction fails for any reason. The underlying
            exception is attached as ``__cause__``.
    """
    logger.info("Extracting text from PDF: %s", pdf_path)
    try:
        return extract_text(pdf_path)
    except Exception as e:
        logger.error("Error while extracting text from PDF: %s", e)
        # Chain the cause so callers can still see the original failure.
        raise ValueError(f"PDF extraction error: {e}") from e


def generate_lesson_from_transcript(
    doc_text: str,
    output_path: str = "/tmp/generated_output.txt",
) -> Tuple[str, Optional[str]]:
    """Generate a lesson draft with the general model and save it to disk.

    Args:
        doc_text: Source text handed to the module-level ``pipe``.
        output_path: File the generated text is written to. Defaults to the
            path this function has always used.

    Returns:
        ``(generated_text, output_path)`` on success, or
        ``("An error occurred", None)`` if generation or the file write fails.
    """
    try:
        logger.info("Generating lesson from transcript using general model.")
        generated_text = pipe(doc_text, max_length=100, truncation=True)[0]["generated_text"]
        # Explicit UTF-8 so non-ASCII model output does not depend on the
        # platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(generated_text)
    except Exception as e:
        # Best-effort contract: report the failure through the return value.
        logger.exception("Error occurred during lesson generation: %s", e)
        return "An error occurred", None

    logger.info("Lesson generation successful. Output saved at: %s", output_path)
    return generated_text, output_path


def refine_with_fine_tuned_model(general_output: str) -> str:
    """Rewrite a draft for teaching purposes using the fine-tuned model.

    Args:
        general_output: Draft text, typically the general model's output.

    Returns:
        The decoded refined text, or ``"An error occurred during refinement."``
        if tokenization, generation or decoding fails.
    """
    try:
        logger.info("Refining the output with fine-tuned model.")
        prompt = "Refine the following text for teaching purposes: " + general_output
        inputs = fine_tuned_tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        output_ids = fine_tuned_model.generate(
            inputs["input_ids"],
            # Pass the mask the tokenizer produced so generate() does not
            # have to infer it from pad tokens.
            attention_mask=inputs["attention_mask"],
            max_length=300,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
        return fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    except Exception as e:
        logger.exception("Error during refinement with fine-tuned model: %s", e)
        return "An error occurred during refinement."


def split_text_into_chunks(text: str, chunk_size: int = 1000) -> List[str]:
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Words inside each chunk are re-joined with single spaces, so the original
    whitespace (newlines, runs of spaces) is not preserved.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        The chunks in order. Empty or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


def generate_lesson_from_chunks(chunks) -> str:
    """Run the general model over each chunk and join the outputs with spaces.

    A chunk whose generation fails is logged and skipped, so the result may
    cover only part of the input.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        The generated texts of the successful chunks joined by single spaces
        (``""`` if none succeeded).
    """
    generated_texts = []
    for index, chunk in enumerate(chunks):
        try:
            generated_text = pipe(chunk, max_length=500, truncation=True)[0]["generated_text"]
        except Exception as e:
            # Deliberate best-effort: one bad chunk must not abort the run.
            logger.exception("Error in chunk processing (chunk %d): %s", index, e)
            continue
        generated_texts.append(generated_text)
    return " ".join(generated_texts)


def process_large_text(text: str, chunk_size: int = 1000) -> str:
    """Generate a lesson from long text by processing it chunk by chunk.

    Args:
        text: Full input text.
        chunk_size: Words per chunk, forwarded to ``split_text_into_chunks``.

    Returns:
        The concatenated generated text for all chunks that succeeded.
    """
    # NOTE(review): the pipeline is called with truncation=True, so chunks
    # longer than the model's input limit are presumably cut short; 1000-word
    # chunks may lose their tail -- confirm and consider a smaller chunk_size.
    chunks = split_text_into_chunks(text, chunk_size=chunk_size)
    return generate_lesson_from_chunks(chunks)