Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,30 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
"""
|
5 |
Processes the uploaded PDF file and returns a bullet summary and a wrapped paragraph summary.
|
6 |
"""
|
7 |
-
#
|
8 |
-
full_text = read_pdf_with_content_filter(
|
9 |
-
|
10 |
-
# Clean the extracted text
|
11 |
cleaned_text = clean_text(full_text)
|
12 |
-
|
13 |
-
# Attempt to extract core sections
|
14 |
sections = extract_core_sections(cleaned_text)
|
15 |
if not sections:
|
16 |
-
# If no sections found, use fallback extraction
|
17 |
core_text = remove_header_metadata(cleaned_text)
|
18 |
else:
|
19 |
-
# Combine core sections in a preferred order
|
20 |
order = ['abstract', 'introduction', 'methods', 'results', 'conclusions', 'discussion']
|
21 |
core_content = [sections[sec] for sec in order if sec in sections]
|
22 |
core_text = " ".join(core_content) if core_content else cleaned_text
|
23 |
-
|
24 |
-
# Split the core text into chunks
|
25 |
chunks = split_into_chunks(core_text, chunk_size=500)
|
26 |
-
|
27 |
-
# Summarize each chunk individually
|
28 |
chunk_summaries = []
|
29 |
for chunk in chunks:
|
30 |
try:
|
@@ -32,18 +136,13 @@ def process_pdf(file_path):
|
|
32 |
except Exception as e:
|
33 |
chunk_summary = ""
|
34 |
chunk_summaries.append(chunk_summary)
|
35 |
-
|
36 |
-
# Combine chunk summaries and perform a final summarization
|
37 |
final_core_summary_text = " ".join(chunk_summaries)
|
38 |
final_summary = summarize_text(final_core_summary_text, max_length=200, min_length=50)
|
39 |
-
|
40 |
-
# Format the final summary as bullet points and wrap as a paragraph
|
41 |
bullet_points = format_bullet_points(final_summary)
|
42 |
paragraph_summary_wrapped = bullet_to_paragraph_wrapped(bullet_points, width=80)
|
43 |
-
|
44 |
return bullet_points, paragraph_summary_wrapped
|
45 |
|
46 |
-
# Create
|
47 |
iface = gr.Interface(
|
48 |
fn=process_pdf,
|
49 |
inputs=gr.File(label="Upload a Medical PDF"),
|
@@ -55,4 +154,5 @@ iface = gr.Interface(
|
|
55 |
description="Upload a medical PDF document to get a summarized bullet-point and paragraph summary of its core content."
|
56 |
)
|
57 |
|
58 |
-
|
|
|
|
1 |
+
# app.py
|
2 |
+
|
3 |
+
import re
|
4 |
+
import nltk
|
5 |
+
import spacy
|
6 |
+
import fitz # PyMuPDF
|
7 |
+
from transformers import pipeline
|
8 |
+
import textwrap
|
9 |
import gradio as gr
|
10 |
|
11 |
+
# Download NLTK punkt if not already done
|
12 |
+
nltk.download('punkt')
|
13 |
+
|
14 |
+
# Load spaCy model
|
15 |
+
nlp = spacy.load("en_core_web_sm")
|
16 |
+
|
17 |
+
# Initialize the BigBird-Pegasus summarization pipeline for PubMed texts
|
18 |
+
summarizer = pipeline("summarization", model="google/bigbird-pegasus-large-pubmed")
|
19 |
+
|
20 |
+
# Helper Function: Read PDF with Content Filter
|
21 |
+
def read_pdf_with_content_filter(file_path,
                                 keywords=("Abstract", "Introduction", "Methods",
                                           "Results", "Conclusions")):
    """Read a PDF and return text only from pages containing a section keyword.

    Pages whose text contains (case-insensitively) at least one of the given
    section headings are kept; pages that mainly hold header/metadata content
    are dropped.

    Args:
        file_path: Path to the PDF file on disk.
        keywords: Iterable of section-heading strings to look for. The default
            is a tuple rather than a list to avoid the mutable-default-argument
            pitfall; callers may still pass any iterable of strings.

    Returns:
        str: Text of all matching pages joined with newlines; "" if no page
        matches.
    """
    # Lowercase the keywords once, outside the page loop, instead of on
    # every membership test as the original did.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    doc = fitz.open(file_path)
    content_pages = []
    # Iterate pages directly rather than indexing with range(len(doc)).
    for page in doc:
        page_text = page.get_text()
        page_text_lower = page_text.lower()
        if any(keyword in page_text_lower for keyword in lowered_keywords):
            content_pages.append(page_text)
    return "\n".join(content_pages)
|
33 |
+
|
34 |
+
# Helper Function: Clean Text
|
35 |
+
def clean_text(text):
    """Strip numeric citation markers and normalize whitespace.

    Removes citations written as "[12]" or "(3)", then collapses every run
    of whitespace to a single space and trims leading/trailing blanks.
    """
    for citation_pattern in (r'\[\d+\]', r'\(\d+\)'):
        text = re.sub(citation_pattern, '', text)
    return re.sub(r'\s+', ' ', text).strip()
|
43 |
+
|
44 |
+
# Helper Function: Extract Core Sections
|
45 |
+
def extract_core_sections(text):
    """Split *text* on common paper headings and map heading -> content.

    Headings (Abstract, Introduction, Methods, Results, Conclusions,
    Discussion) are matched case-insensitively when followed by a colon,
    period, or newline. Returns an empty dict when no heading is found;
    keys are lowercased heading names.
    """
    heading_pattern = r'(?i)(Abstract|Introduction|Methods|Results|Conclusions|Discussion)\s*[:\n\.]'
    parts = re.split(heading_pattern, text)
    if len(parts) <= 1:
        return {}
    # With one capture group, re.split yields
    # [preamble, heading1, body1, heading2, body2, ...].
    result = {}
    for idx in range(1, len(parts), 2):
        body = parts[idx + 1].strip() if idx + 1 < len(parts) else ""
        result[parts[idx].strip().lower()] = body
    return result
|
59 |
+
|
60 |
+
# Helper Function: Remove Header Metadata
|
61 |
+
def remove_header_metadata(text, marker="Competing Interests:"):
    """Drop everything up to and including *marker* from *text*.

    When the marker is present, only the (stripped) text after it is
    returned; otherwise the input is returned unchanged.
    """
    _, found, tail = text.partition(marker)
    return tail.strip() if found else text
|
70 |
+
|
71 |
+
# Helper Function: Split Text into Chunks
|
72 |
+
def split_into_chunks(text, chunk_size=500):
    """Split *text* into chunks of roughly *chunk_size* words each.

    The last chunk may be shorter; an empty/whitespace-only input yields [].
    """
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
|
82 |
+
|
83 |
+
# Helper Function: Summarize Text
|
84 |
+
def summarize_text(text, max_length=200, min_length=50):
    """Summarize *text* with the module-level BigBird-Pegasus pipeline.

    For short inputs (fewer than 60 words) the requested output lengths are
    capped (max 40 / min 10) so the model is not asked to emit a summary
    longer than its input.
    """
    if len(text.split()) < 60:
        max_length = min(max_length, 40)
        min_length = min(min_length, 10)
    result = summarizer(text,
                        max_length=max_length,
                        min_length=min_length,
                        do_sample=False)
    return result[0]['summary_text']
|
95 |
+
|
96 |
+
# Helper Function: Format Bullet Points
|
97 |
+
def format_bullet_points(summary):
    """Render *summary* as one "- " bullet per sentence, newline-separated."""
    return "\n".join("- " + sentence for sentence in nltk.sent_tokenize(summary))
|
104 |
+
|
105 |
+
# Helper Function: Convert Bullets to Wrapped Paragraph
|
106 |
+
def bullet_to_paragraph_wrapped(bullet_text, width=80):
    """Convert a "- " bullet list into a single wrapped paragraph.

    Strips the "- " bullet marker only at the start of each line, removes
    the model's "<n>" newline tokens, collapses whitespace, and wraps the
    result to *width* columns.

    Args:
        bullet_text: Newline-separated bullet lines, as produced by
            format_bullet_points().
        width: Maximum line width passed to textwrap.fill().

    Returns:
        str: The wrapped paragraph.
    """
    # Fix: strip the bullet prefix at line starts only. The original
    # replace("- ", "") removed EVERY "- " occurrence, corrupting
    # hyphen-space sequences inside sentences (e.g. "Range 3 - 5").
    lines = [
        line[2:] if line.startswith("- ") else line
        for line in bullet_text.splitlines()
    ]
    paragraph = " ".join(lines).replace("<n>", " ")
    paragraph = re.sub(r'\s+', ' ', paragraph).strip()
    return textwrap.fill(paragraph, width=width)
|
114 |
+
|
115 |
+
# Process PDF Function (Gradio Interface)
|
116 |
+
def process_pdf(file_obj):
|
117 |
"""
|
118 |
Processes the uploaded PDF file and returns a bullet summary and a wrapped paragraph summary.
|
119 |
"""
|
120 |
+
# file_obj is a temporary file path provided by Gradio
|
121 |
+
full_text = read_pdf_with_content_filter(file_obj.name)
|
|
|
|
|
122 |
cleaned_text = clean_text(full_text)
|
|
|
|
|
123 |
sections = extract_core_sections(cleaned_text)
|
124 |
if not sections:
|
|
|
125 |
core_text = remove_header_metadata(cleaned_text)
|
126 |
else:
|
|
|
127 |
order = ['abstract', 'introduction', 'methods', 'results', 'conclusions', 'discussion']
|
128 |
core_content = [sections[sec] for sec in order if sec in sections]
|
129 |
core_text = " ".join(core_content) if core_content else cleaned_text
|
130 |
+
|
|
|
131 |
chunks = split_into_chunks(core_text, chunk_size=500)
|
|
|
|
|
132 |
chunk_summaries = []
|
133 |
for chunk in chunks:
|
134 |
try:
|
|
|
136 |
except Exception as e:
|
137 |
chunk_summary = ""
|
138 |
chunk_summaries.append(chunk_summary)
|
|
|
|
|
139 |
final_core_summary_text = " ".join(chunk_summaries)
|
140 |
final_summary = summarize_text(final_core_summary_text, max_length=200, min_length=50)
|
|
|
|
|
141 |
bullet_points = format_bullet_points(final_summary)
|
142 |
paragraph_summary_wrapped = bullet_to_paragraph_wrapped(bullet_points, width=80)
|
|
|
143 |
return bullet_points, paragraph_summary_wrapped
|
144 |
|
145 |
+
# Create Gradio Interface
|
146 |
iface = gr.Interface(
|
147 |
fn=process_pdf,
|
148 |
inputs=gr.File(label="Upload a Medical PDF"),
|
|
|
154 |
description="Upload a medical PDF document to get a summarized bullet-point and paragraph summary of its core content."
|
155 |
)
|
156 |
|
157 |
+
if __name__ == "__main__":
|
158 |
+
iface.launch()
|