File size: 3,164 Bytes
c2386f3
d15716c
6156752
 
226d8dd
c2386f3
 
51afa48
f4aeea8
51afa48
 
6156752
 
 
 
 
 
 
 
 
 
 
 
 
51afa48
 
6156752
51afa48
c2386f3
51afa48
 
 
 
c2386f3
51afa48
c2386f3
51afa48
c2386f3
 
51afa48
6156752
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ef22e2
226d8dd
2ef22e2
 
226d8dd
2ef22e2
 
 
 
 
 
39c36c8
 
 
 
 
6156752
226d8dd
2ef22e2
 
226d8dd
 
b423cba
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import logging
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
from pdfminer.high_level import extract_text


# Configure logging once at import time: INFO level, with timestamp, level
# name and message on every record.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# General-purpose text2text pipeline backed by google-t5/t5-base, pinned to CPU.
pipe = pipeline("text2text-generation", model="google-t5/t5-base", device="cpu")
# Use the tokenizer's EOS token id as the model's pad token id.
# NOTE(review): T5 tokenizers normally ship their own pad token; confirm this
# override is intentional.
pipe.model.config.pad_token_id = pipe.tokenizer.eos_token_id 

# Fine-tuned T5 checkpoint and its tokenizer, loaded from a local directory at
# import time (importing this module fails if the directory is missing).
fine_tuned_model_path = "./fine_tuned_model"
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_path)
fine_tuned_tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)


def pdf_to_text(pdf_path):
    """Extract the plain text of a PDF file.

    Args:
        pdf_path: Location of the PDF; passed unchanged to pdfminer's
            ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for the given path.

    Raises:
        ValueError: If extraction fails for any reason. The underlying
            exception is attached as ``__cause__`` so the root cause is
            preserved in tracebacks.
    """
    logger.info(f"Extracting text from PDF: {pdf_path}")
    try:
        return extract_text(pdf_path)
    except Exception as e:
        logger.error(f"Error while extracting text from PDF: {str(e)}")
        # Chain explicitly so callers see the original failure as the direct
        # cause instead of an incidental "during handling" context.
        raise ValueError(f"PDF extraction error: {str(e)}") from e

def generate_lesson_from_transcript(doc_text, output_path="/tmp/generated_output.txt", max_length=100):
    """Generate lesson text from a transcript and save it to a file.

    Runs the module-level general ``pipe`` on ``doc_text`` (with truncation
    enabled) and writes the generated text to ``output_path``.

    Args:
        doc_text: Source text to feed to the general model.
        output_path: File the generated text is written to. Defaults to the
            historical fixed location; pass a unique path when several
            generations may run at once so they do not overwrite each other.
        max_length: ``max_length`` forwarded to the pipeline call.

    Returns:
        A ``(generated_text, output_path)`` tuple on success, or
        ``("An error occurred", None)`` if generation or writing fails (the
        error is logged, not raised).
    """
    try:
        logger.info("Generating lesson from transcript using general model.")
        generated_text = pipe(doc_text, max_length=max_length, truncation=True)[0]['generated_text']

        # Explicit UTF-8: model output may contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(generated_text)

        logger.info(f"Lesson generation successful. Output saved at: {output_path}")
        return generated_text, output_path

    except Exception as e:
        # Best-effort contract: callers receive a sentinel instead of an exception.
        logger.error(f"Error occurred during lesson generation: {str(e)}")
        return "An error occurred", None

def refine_with_fine_tuned_model(general_output):
    """Refine generated text with the fine-tuned model.

    Prefixes ``general_output`` with a fixed instruction, tokenizes it
    (truncated to 512 tokens) and generates up to 300 tokens with the
    module-level fine-tuned model.

    Args:
        general_output: Text produced by the general model.

    Returns:
        The decoded refined text with special tokens removed, or the string
        ``"An error occurred during refinement."`` if anything fails (the
        error is logged, not raised).
    """
    try:
        logger.info("Refining the output with fine-tuned model.")
        prompt = "Refine the following text for teaching purposes: " + general_output
        inputs = fine_tuned_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        output_ids = fine_tuned_model.generate(
            inputs["input_ids"],
            # Pass the tokenizer's mask explicitly so padding positions (if any)
            # are never attended to.
            attention_mask=inputs["attention_mask"],
            max_length=300,
            no_repeat_ngram_size=3,
            # early_stopping was dropped: it only applies to beam-based
            # generation, and no num_beams is set here.
        )
        refined_text = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return refined_text
    except Exception as e:
        # Best-effort contract: callers receive a sentinel string on failure.
        logger.error(f"Error during refinement with fine-tuned model: {str(e)}")
        return "An error occurred during refinement."

def split_text_into_chunks(text, chunk_size=1000):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Words are obtained with ``str.split()``, so runs of whitespace collapse
    and each chunk is re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in original order; empty if ``text`` contains
        no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size would
            otherwise yield an empty list and silently drop all of the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def generate_lesson_from_chunks(chunks):
    """Run the general model on each chunk and join the results.

    Each chunk is passed to the module-level ``pipe`` with ``max_length=500``
    and truncation enabled. A chunk that fails is logged and skipped so one
    bad chunk does not discard the output of the others.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        The generated texts of all successful chunks joined by single spaces;
        an empty string if there are no chunks or all of them fail.
    """
    generated_texts = []
    for chunk in chunks:
        try:
            generated_text = pipe(chunk, max_length=500, truncation=True)[0]['generated_text']
            generated_texts.append(generated_text)
        except Exception as e:
            # Report through the module logger, like the rest of the file,
            # rather than printing to stdout.
            logger.error(f"Error in chunk processing: {str(e)}")
    return ' '.join(generated_texts)

def process_large_text(text, chunk_size=1000):
    """Generate lesson text for a long input by processing it in chunks.

    Splits ``text`` with ``split_text_into_chunks`` and feeds the chunks to
    ``generate_lesson_from_chunks``.

    Args:
        text: The full input text.
        chunk_size: Maximum number of words per chunk (default 1000, the
            previously hard-coded value).
            NOTE(review): the chunks are later run with ``truncation=True``;
            if the model's input limit is smaller than a 1000-word chunk, the
            tail of each chunk is likely dropped. Confirm the limit and
            consider a smaller value.

    Returns:
        The concatenated generated text for all chunks.
    """
    chunks = split_text_into_chunks(text, chunk_size=chunk_size)
    return generate_lesson_from_chunks(chunks)