GurgenGulay committed on
Commit
6156752
·
verified ·
1 Parent(s): d15716c

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +34 -3
utils.py CHANGED
@@ -1,16 +1,30 @@
1
  import logging
2
  from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
 
 
3
 
4
- # Logging Ayarları
5
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
6
  logger = logging.getLogger(__name__)
7
 
8
  pipe = pipeline("text2text-generation", model="google-t5/t5-base", device="cpu")
9
  pipe.model.config.pad_token_id = pipe.tokenizer.eos_token_id
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def generate_lesson_from_transcript(doc_text):
12
  try:
13
- logger.info("Generating lesson from transcript.")
14
  generated_text = pipe(doc_text, max_length=100, truncation=True)[0]['generated_text']
15
  output_path = "/tmp/generated_output.txt"
16
 
@@ -24,6 +38,23 @@ def generate_lesson_from_transcript(doc_text):
24
  logger.error(f"Error occurred during lesson generation: {str(e)}")
25
  return "An error occurred", None
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def split_text_into_chunks(text, chunk_size=1000):
28
  words = text.split()
29
  chunks = []
@@ -40,7 +71,7 @@ def generate_lesson_from_chunks(chunks):
40
  generated_texts.append(generated_text)
41
  except Exception as e:
42
  print(f"Error in chunk processing: {str(e)}")
43
- continue # Hata durumunda işlemi sürdür
44
  return ' '.join(generated_texts)
45
 
46
  def process_large_text(text):
 
1
  import logging
2
  from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
3
+ from pdfminer.high_level import extract_text
4
+
5
 
 
6
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
7
  logger = logging.getLogger(__name__)
8
 
9
  pipe = pipeline("text2text-generation", model="google-t5/t5-base", device="cpu")
10
  pipe.model.config.pad_token_id = pipe.tokenizer.eos_token_id
11
 
12
+ fine_tuned_model_path = "./fine_tuned_model"
13
+ fine_tuned_model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_path)
14
+ fine_tuned_tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)
15
+
16
+
17
+ def pdf_to_text(pdf_path):
18
+ try:
19
+ logger.info(f"Extracting text from PDF: {pdf_path}")
20
+ return extract_text(pdf_path)
21
+ except Exception as e:
22
+ logger.error(f"Error while extracting text from PDF: {str(e)}")
23
+ raise ValueError(f"PDF extraction error: {str(e)}")
24
+
25
  def generate_lesson_from_transcript(doc_text):
26
  try:
27
+ logger.info("Generating lesson from transcript using general model.")
28
  generated_text = pipe(doc_text, max_length=100, truncation=True)[0]['generated_text']
29
  output_path = "/tmp/generated_output.txt"
30
 
 
38
  logger.error(f"Error occurred during lesson generation: {str(e)}")
39
  return "An error occurred", None
40
 
41
+ def refine_with_fine_tuned_model(general_output):
42
+ try:
43
+ logger.info("Refining the output with fine-tuned model.")
44
+ prompt = "Refine the following text for teaching purposes: " + general_output
45
+ inputs = fine_tuned_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
46
+ output_ids = fine_tuned_model.generate(
47
+ inputs["input_ids"],
48
+ max_length=300,
49
+ no_repeat_ngram_size=3,
50
+ early_stopping=True
51
+ )
52
+ refined_text = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
53
+ return refined_text
54
+ except Exception as e:
55
+ logger.error(f"Error during refinement with fine-tuned model: {str(e)}")
56
+ return "An error occurred during refinement."
57
+
58
  def split_text_into_chunks(text, chunk_size=1000):
59
  words = text.split()
60
  chunks = []
 
71
  generated_texts.append(generated_text)
72
  except Exception as e:
73
  print(f"Error in chunk processing: {str(e)}")
74
+ continue
75
  return ' '.join(generated_texts)
76
 
77
  def process_large_text(text):