Kaiyeee committed on
Commit
2e0a452
·
verified ·
1 Parent(s): a628757

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -20
app.py CHANGED
@@ -1,30 +1,134 @@
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
- def process_pdf(file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  """
5
  Processes the uploaded PDF file and returns a bullet summary and a wrapped paragraph summary.
6
  """
7
- # Use the content filter to extract text from relevant pages
8
- full_text = read_pdf_with_content_filter(file_path)
9
-
10
- # Clean the extracted text
11
  cleaned_text = clean_text(full_text)
12
-
13
- # Attempt to extract core sections
14
  sections = extract_core_sections(cleaned_text)
15
  if not sections:
16
- # If no sections found, use fallback extraction
17
  core_text = remove_header_metadata(cleaned_text)
18
  else:
19
- # Combine core sections in a preferred order
20
  order = ['abstract', 'introduction', 'methods', 'results', 'conclusions', 'discussion']
21
  core_content = [sections[sec] for sec in order if sec in sections]
22
  core_text = " ".join(core_content) if core_content else cleaned_text
23
-
24
- # Split the core text into chunks
25
  chunks = split_into_chunks(core_text, chunk_size=500)
26
-
27
- # Summarize each chunk individually
28
  chunk_summaries = []
29
  for chunk in chunks:
30
  try:
@@ -32,18 +136,13 @@ def process_pdf(file_path):
32
  except Exception as e:
33
  chunk_summary = ""
34
  chunk_summaries.append(chunk_summary)
35
-
36
- # Combine chunk summaries and perform a final summarization
37
  final_core_summary_text = " ".join(chunk_summaries)
38
  final_summary = summarize_text(final_core_summary_text, max_length=200, min_length=50)
39
-
40
- # Format the final summary as bullet points and wrap as a paragraph
41
  bullet_points = format_bullet_points(final_summary)
42
  paragraph_summary_wrapped = bullet_to_paragraph_wrapped(bullet_points, width=80)
43
-
44
  return bullet_points, paragraph_summary_wrapped
45
 
46
- # Create a Gradio interface with a file upload input and two text outputs.
47
  iface = gr.Interface(
48
  fn=process_pdf,
49
  inputs=gr.File(label="Upload a Medical PDF"),
@@ -55,4 +154,5 @@ iface = gr.Interface(
55
  description="Upload a medical PDF document to get a summarized bullet-point and paragraph summary of its core content."
56
  )
57
 
58
- iface.launch()
 
 
1
+ # app.py
2
+
3
+ import re
4
+ import nltk
5
+ import spacy
6
+ import fitz # PyMuPDF
7
+ from transformers import pipeline
8
+ import textwrap
9
  import gradio as gr
10
 
11
+ # Download NLTK punkt if not already done
12
+ nltk.download('punkt')
13
+
14
+ # Load spaCy model
15
+ nlp = spacy.load("en_core_web_sm")
16
+
17
+ # Initialize the BigBird-Pegasus summarization pipeline for PubMed texts
18
+ summarizer = pipeline("summarization", model="google/bigbird-pegasus-large-pubmed")
19
+
20
+ # Helper Function: Read PDF with Content Filter
21
def read_pdf_with_content_filter(file_path, keywords=("Abstract", "Introduction", "Methods", "Results", "Conclusions")):
    """
    Read a PDF and return the text of the pages that mention any keyword.

    A page is kept when at least one keyword occurs in its text as a
    case-insensitive substring; all other pages are dropped. This helps
    exclude pages that mainly contain header/metadata.

    Args:
        file_path: Path of the PDF, passed straight to ``fitz.open``.
        keywords: Iterable of strings to look for on each page.

    Returns:
        The text of the matching pages, in page order, joined with newlines.
        An empty string when no page matches.
    """
    # Lowercase the keywords once instead of once per page and comparison.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    doc = fitz.open(file_path)
    try:
        content_pages = []
        for page in doc:
            page_text = page.get_text()
            lowered_page = page_text.lower()
            if any(keyword in lowered_page for keyword in lowered_keywords):
                content_pages.append(page_text)
    finally:
        # Release the underlying file handle even if text extraction fails.
        doc.close()
    return "\n".join(content_pages)
33
+
34
+ # Helper Function: Clean Text
35
def clean_text(text):
    """
    Strip numeric citation markers and normalize whitespace.

    Removes bracketed numeric citations, including grouped and ranged forms
    such as ``[12]``, ``[1, 2]`` and ``[3-5]``, removes parenthesized
    single-number markers such as ``(3)``, collapses every run of whitespace
    to one space, and trims both ends.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Bracketed citations: one number, or several separated by comma,
    # semicolon, hyphen or en dash. Non-numeric brackets are left alone.
    text = re.sub(r'\[\d+(?:\s*[,;\-\u2013]\s*\d+)*\]', '', text)
    text = re.sub(r'\(\d+\)', '', text)  # Remove citations like (3)
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    return text.strip()
43
+
44
+ # Helper Function: Extract Core Sections
45
def extract_core_sections(text):
    """
    Split text into core sections keyed by their heading.

    A heading is one of Abstract, Introduction, Methods, Results,
    Conclusions or Discussion (matched case-insensitively as a whole-word
    start) followed by optional whitespace and a colon, newline or period.

    Args:
        text: The text to scan for section headings.

    Returns:
        A dict mapping the lowercase heading to the stripped text that follows
        it, up to the next heading. When a heading occurs more than once, the
        pieces are joined with a space in order of appearance. Text before the
        first heading is not included. Empty dict when no heading is found.
    """
    # \b keeps a heading from matching inside a longer word
    # (e.g. "reintroduction." must not start an "introduction" section).
    pattern = r'(?i)\b(Abstract|Introduction|Methods|Results|Conclusions|Discussion)\s*[:\n\.]'
    splits = re.split(pattern, text)

    sections = {}
    # With one capture group re.split yields [prefix, heading, body, heading, body, ...].
    for heading, body in zip(splits[1::2], splits[2::2]):
        key = heading.strip().lower()
        body = body.strip()
        if key in sections:
            # Keep earlier content instead of silently overwriting it.
            sections[key] = (sections[key] + " " + body).strip()
        else:
            sections[key] = body
    return sections
59
+
60
+ # Helper Function: Remove Header Metadata
61
def remove_header_metadata(text, marker="Competing Interests:"):
    """
    Drop everything up to and including the first occurrence of a marker.

    Args:
        text: The text to trim.
        marker: The string whose first occurrence ends the header/metadata.

    Returns:
        The stripped text following the marker, or ``text`` unchanged when
        the marker does not occur.
    """
    try:
        marker_start = text.index(marker)
    except ValueError:
        # Marker absent: hand the input back untouched.
        return text
    body_start = marker_start + len(marker)
    return text[body_start:].strip()
70
+
71
+ # Helper Function: Split Text into Chunks
72
def split_into_chunks(text, chunk_size=500):
    """
    Break text into consecutive chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens of ``text``; each chunk is its
    words re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.
    """
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
82
+
83
+ # Helper Function: Summarize Text
84
def summarize_text(text, max_length=200, min_length=50):
    """
    Summarize text with the module-level ``summarizer`` pipeline.

    Args:
        text: The text to summarize.
        max_length: Upper bound passed to the summarizer; capped at 40 when
            the input has fewer than 60 whitespace-separated words.
        min_length: Lower bound passed to the summarizer; capped at 10 for
            the same short inputs and never allowed to exceed ``max_length``.

    Returns:
        The ``summary_text`` of the first pipeline result, or an empty string
        when ``text`` is empty or whitespace-only (the model is not called).
    """
    # Nothing to summarize, e.g. when every chunk summary upstream failed.
    if not text or not text.strip():
        return ""

    input_length = len(text.split())
    if input_length < 60:
        max_length = min(max_length, 40)
        min_length = min(min_length, 10)
    # Guard against an inverted range, whether passed in or produced by the clamp.
    min_length = min(min_length, max_length)

    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']
95
+
96
+ # Helper Function: Format Bullet Points
97
def format_bullet_points(summary):
    """
    Format a summary as one "- " bullet per sentence.

    Literal ``<n>`` markers are replaced by spaces and whitespace runs are
    collapsed before splitting, so the bullets carry the same cleaned text
    that ``bullet_to_paragraph_wrapped`` produces for the paragraph view.

    Args:
        summary: The summary text to format.

    Returns:
        The bullets joined with newlines, or an empty string when the summary
        is empty or whitespace-only.
    """
    if not summary or not summary.strip():
        return ""
    # split()/join collapses the extra spaces left where "<n>" was removed.
    normalized = " ".join(summary.replace("<n>", " ").split())
    sentences = nltk.sent_tokenize(normalized)
    return "\n".join("- " + sentence for sentence in sentences)
104
+
105
+ # Helper Function: Convert Bullets to Wrapped Paragraph
106
def bullet_to_paragraph_wrapped(bullet_text, width=80):
    """
    Turn a "- " bullet list into one paragraph wrapped to a given width.

    Args:
        bullet_text: Bullet lines, each starting with "- ".
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The bullet contents joined into a single paragraph, with ``<n>``
        markers replaced by spaces, whitespace collapsed, and lines wrapped
        to ``width``.
    """
    # Strip only the marker at the start of each line; a blanket
    # replace("- ", "") would also eat hyphens inside sentences
    # (e.g. "pre- and post-" would become "preand post-").
    without_markers = re.sub(r'(?m)^- ', '', bullet_text)
    paragraph = without_markers.replace("<n>", " ")
    paragraph = re.sub(r'\s+', ' ', paragraph).strip()
    return textwrap.fill(paragraph, width=width)
114
+
115
+ # Process PDF Function (Gradio Interface)
116
+ def process_pdf(file_obj):
117
  """
118
  Processes the uploaded PDF file and returns a bullet summary and a wrapped paragraph summary.
119
  """
120
+ # file_obj is a temporary file path provided by Gradio
121
+ full_text = read_pdf_with_content_filter(file_obj.name)
 
 
122
  cleaned_text = clean_text(full_text)
 
 
123
  sections = extract_core_sections(cleaned_text)
124
  if not sections:
 
125
  core_text = remove_header_metadata(cleaned_text)
126
  else:
 
127
  order = ['abstract', 'introduction', 'methods', 'results', 'conclusions', 'discussion']
128
  core_content = [sections[sec] for sec in order if sec in sections]
129
  core_text = " ".join(core_content) if core_content else cleaned_text
130
+
 
131
  chunks = split_into_chunks(core_text, chunk_size=500)
 
 
132
  chunk_summaries = []
133
  for chunk in chunks:
134
  try:
 
136
  except Exception as e:
137
  chunk_summary = ""
138
  chunk_summaries.append(chunk_summary)
 
 
139
  final_core_summary_text = " ".join(chunk_summaries)
140
  final_summary = summarize_text(final_core_summary_text, max_length=200, min_length=50)
 
 
141
  bullet_points = format_bullet_points(final_summary)
142
  paragraph_summary_wrapped = bullet_to_paragraph_wrapped(bullet_points, width=80)
 
143
  return bullet_points, paragraph_summary_wrapped
144
 
145
+ # Create Gradio Interface
146
  iface = gr.Interface(
147
  fn=process_pdf,
148
  inputs=gr.File(label="Upload a Medical PDF"),
 
154
  description="Upload a medical PDF document to get a summarized bullet-point and paragraph summary of its core content."
155
  )
156
 
157
# Start the Gradio app only when this file is executed as a script,
# so importing the module does not launch it.
if __name__ == "__main__":
    iface.launch()