GurgenGulay committed on
Commit
707262f
·
verified ·
1 Parent(s): 423ad6c

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +46 -31
utils.py CHANGED
@@ -1,7 +1,7 @@
1
  import logging
 
 
2
  from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
3
- from pdfminer.high_level import extract_text
4
- from fine_tuning import fine_tune_model
5
 
6
 
7
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -15,13 +15,52 @@ fine_tuned_model = T5ForConditionalGeneration.from_pretrained(fine_tuned_model_p
15
  fine_tuned_tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)
16
 
17
 
18
- def pdf_to_text(pdf_path):
19
  try:
20
- logger.info(f"Extracting text from PDF: {pdf_path}")
21
- return extract_text(pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  except Exception as e:
23
- logger.error(f"Error while extracting text from PDF: {str(e)}")
24
- raise ValueError(f"PDF extraction error: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def generate_lesson_from_transcript(doc_text):
27
  try:
@@ -55,27 +94,3 @@ def refine_with_fine_tuned_model(general_output):
55
  except Exception as e:
56
  logger.error(f"Error during refinement with fine-tuned model: {str(e)}")
57
  return "An error occurred during refinement."
58
-
59
- def split_text_into_chunks(text, chunk_size=1000):
60
- words = text.split()
61
- chunks = []
62
- for i in range(0, len(words), chunk_size):
63
- chunk = ' '.join(words[i:i+chunk_size])
64
- chunks.append(chunk)
65
- return chunks
66
-
67
- def generate_lesson_from_chunks(chunks):
68
- generated_texts = []
69
- for chunk in chunks:
70
- try:
71
- generated_text = pipe(chunk, max_length=500, truncation=True)[0]['generated_text']
72
- generated_texts.append(generated_text)
73
- except Exception as e:
74
- print(f"Error in chunk processing: {str(e)}")
75
- continue
76
- return ' '.join(generated_texts)
77
-
78
- def process_large_text(text):
79
- chunks = split_text_into_chunks(text, chunk_size=1000)
80
- generated_text = generate_lesson_from_chunks(chunks)
81
- return generated_text
 
1
  import logging
2
+ import os
3
+ import fitz
4
  from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
 
 
5
 
6
 
7
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
15
  fine_tuned_tokenizer = T5Tokenizer.from_pretrained(fine_tuned_model_path)
16
 
17
 
18
+ def extract_text_from_pdf(pdf_path):
19
  try:
20
+ if not os.path.exists(pdf_path):
21
+ raise FileNotFoundError(f"PDF file '{pdf_path}' does not exist.")
22
+
23
+ # PDF dosyasından metni çıkar
24
+ document = fitz.open(pdf_path)
25
+ text = ""
26
+ for page_num in range(document.page_count):
27
+ page = document.load_page(page_num)
28
+ text += page.get_text("text")
29
+
30
+ print(f"Text extraction successful from {pdf_path}.")
31
+ return text
32
+ except FileNotFoundError as e:
33
+ print(f"Error: {e}")
34
+ raise e
35
  except Exception as e:
36
+ print(f"An error occurred while extracting text from PDF: {e}")
37
+ raise e
38
+
39
+ def split_text_into_chunks(text, chunk_size=1000):
40
+ words = text.split()
41
+ chunks = []
42
+ for i in range(0, len(words), chunk_size):
43
+ chunk = ' '.join(words[i:i+chunk_size])
44
+ chunks.append(chunk)
45
+ return chunks
46
+
47
+ def batch_process_texts(texts, batch_size=2):
48
+ batched_results = []
49
+ for i in range(0, len(texts), batch_size):
50
+ batch = texts[i:i+batch_size]
51
+ try:
52
+ combined_text = " ".join(batch)
53
+ processed_text = some_processing_function(combined_text)
54
+ batched_results.append(processed_text)
55
+ except Exception as e:
56
+ print(f"Error processing batch {i // batch_size + 1}: {e}")
57
+ continue
58
+
59
+ return batched_results
60
+
61
+ def generate_lesson_from_chunks(chunks):
62
+ generated_texts = batch_process_texts(chunks)
63
+ return ' '.join(generated_texts)
64
 
65
  def generate_lesson_from_transcript(doc_text):
66
  try:
 
94
  except Exception as e:
95
  logger.error(f"Error during refinement with fine-tuned model: {str(e)}")
96
  return "An error occurred during refinement."