RyanTietjen committed on
Commit d85b3ec · verified · 1 Parent(s): 9aabcb5

Upload 4 files

Files changed (5)
  1. .gitattributes +1 -0
  2. 20k_5_epochs.keras +3 -0
  3. app.py +183 -0
  4. model.py +71 -0
  5. process_input.py +67 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ 20k_5_epochs.keras filter=lfs diff=lfs merge=lfs -text
20k_5_epochs.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0565eca7a0095c40e516cd84b42354a964f0143f77c31c7329818a2853307f9
+ size 1691202
app.py ADDED
@@ -0,0 +1,183 @@
+ """
+ Ryan Tietjen
+ Sep 2024
+ Demo application for paper abstract fragmentation
+ """
+ import gradio as gr
+ import tensorflow as tf
+ from tensorflow import keras
+ from keras import layers
+ from timeit import default_timer as timer
+ from process_input import split_abstract
+ from process_input import split_abstract_original
+ from process_input import split_sentences_by_characters
+ import pandas as pd
+ import tensorflow_hub as hub
+ from model import EmbeddingLayer
+ from process_input import encode_labels
+
+
+ sample_list = []
+ example1 = """The aim of this study was to describe the electrocardiographic ( ECG ) evolutionary changes after an acute myocardial infarction ( AMI ) and to evaluate their correlation with left ventricular function and remodeling.
+ The QRS complex changes after AMI have been correlated with infarct size and left ventricular function.
+ By contrast , the significance of T wave changes is controversial.
+ We studied 536 patients enrolled in the GISSI-3-Echo substudy who underwent ECG and echocardiographic studies at 24 to 48 h ( S1 ) , at hospital discharge ( S2 ) , at six weeks ( S3 ) and six months ( S4 ) after AMI.
+ The number of Q waves ( nQ ) and QRS quantitative score ( QRSs ) did not change over time.
+ From S2 to S4 , the number of negative T waves ( nT NEG ) decreased ( p < 0.0001 ) , wall motion abnormalities ( % WMA ) improved ( p < 0.001 ) , ventricular volumes increased ( p < 0.0001 ) while ejection fraction remained stable.
+ According to the T wave changes after hospital discharge , patients were divided into four groups : stable positive T waves ( group 1 , n = 35 ) , patients who showed a decrease > or = 1 in nT NEG ( group 2 , n = 361 ) , patients with no change in nT NEG ( group 3 , n = 64 ) and those with an increase > or = 1 in nT NEG ( group 4 , n = 76 ).
+ The QRSs and nQ remained stable in all groups.
+ Groups 3 and 4 showed less recovery in % WMA , more pronounced ventricular enlargement and progressive decline in ejection fraction than groups 1 and 2 ( interaction time x groups p < 0.0001 ).
+ The analysis of serial ECG can predict postinfarct left ventricular remodeling.
+ Normalization of negative T waves during the follow-up appears more strictly related to recovery of regional dysfunction than QRS changes.
+ Lack of resolution and late appearance of new negative T predict unfavorable remodeling with progressive deterioration of ventricular function."""
+ sample_list.append(example1)
+
+ def format_non_empty_lists(objective, background, methods, results, conclusion):
+     """
+     Checks each provided list and formats a string with the list name and its contents
+     only if the list is not empty.
+
+     Parameters:
+     - objective (list): List containing sentences classified as 'Objective'.
+     - background (list): List containing sentences classified as 'Background'.
+     - methods (list): List containing sentences classified as 'Methods'.
+     - results (list): List containing sentences classified as 'Results'.
+     - conclusion (list): List containing sentences classified as 'Conclusion'.
+
+     Returns:
+     - str: A formatted string that contains the non-empty list names and their contents.
+     """
+     output = ""
+     lists = {
+         'Objective': objective,
+         'Background': background,
+         'Methods': methods,
+         'Results': results,
+         'Conclusion': conclusion
+     }
+
+     for name, content in lists.items():
+         if content:  # Check if the list is not empty
+             output += f"{name}:\n"  # Append the category name followed by a newline
+             for item in content:
+                 output += f" - {item}\n"  # Append each item as a list entry
+             output += "\n"  # Blank line to separate categories
+
+     return output.strip()
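+
+ # Illustrative output, for a hypothetical input (a minimal sketch, not part of the app flow):
+ # format_non_empty_lists(["Aim."], [], ["Method."], [], [])
+ # -> "Objective:\n - Aim.\n\nMethods:\n - Method."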
+
+ def fragment_single_abstract(abstract):
+     """
+     Processes a single abstract by fragmenting it into structured sections based on predefined categories
+     such as Objective, Methods, Results, Conclusions, and Background. The function utilizes a pre-trained
+     Keras model to predict the category of each sentence in the abstract.
+
+     The process involves several steps:
+     1. Splitting the abstract into sentences.
+     2. Encoding these sentences using a custom embedding layer.
+     3. Classifying each sentence into one of the predefined categories.
+     4. Grouping the sentences by their predicted categories.
+
+     Parameters:
+     abstract (str): The abstract text that needs to be processed and categorized.
+
+     Returns:
+     tuple: A tuple containing two elements:
+         - A formatted string that lists the non-empty categories ('Objective', 'Background',
+           'Methods', 'Results', 'Conclusion') together with the sentences assigned to them.
+         - The time taken to process the abstract (in seconds).
+
+     Example:
+     ```python
+     abstract_text = "This study aims to evaluate the effectiveness of..."
+     categorized_abstract, processing_time = fragment_single_abstract(abstract_text)
+     print("Categorized Abstract:", categorized_abstract)
+     print("Processing Time:", processing_time)
+     ```
+
+     Note:
+     - This function assumes that the Keras model '20k_5_epochs.keras' and the custom embedding layer
+       'EmbeddingLayer' are available and correctly configured to be loaded.
+     - The function uses pandas for data manipulation, TensorFlow for machine learning operations,
+       and TensorFlow's data API for batching and prefetching data for model predictions.
+     """
+     start_time = timer()
+
+     original_abstract = split_abstract_original(abstract)
+     df_original = pd.DataFrame(original_abstract)
+     sentences_original = df_original["text"].tolist()
+
+     abstract_split = split_abstract(abstract)
+     df = pd.DataFrame(abstract_split)
+     sentences = df["text"].tolist()
+     labels = encode_labels(df["target"])  # placeholder labels; the predictions below ignore them
+
+     objective = []
+     background = []
+     methods = []
+     results = []
+     conclusion = []
+
+     embed_layer = EmbeddingLayer()
+     model = tf.keras.models.load_model("20k_5_epochs.keras", custom_objects={'EmbeddingLayer': embed_layer})
+
+     data_by_character = split_sentences_by_characters(sentences)
+     # One-hot depths match the training configuration (15 line positions, 20 total lines; see model.py)
+     line_numbers = tf.one_hot(df["line_number"].to_numpy(), depth=15)
+     total_line_numbers = tf.one_hot(df["total_lines"].to_numpy(), depth=20)
+
+     sentences_dataset = tf.data.Dataset.from_tensor_slices((line_numbers, total_line_numbers, sentences, data_by_character))
+     labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
+     dataset = tf.data.Dataset.zip((sentences_dataset, labels_dataset)).batch(32).prefetch(tf.data.AUTOTUNE)
+
+     predictions = tf.argmax(model.predict(dataset), axis=1)
+
+     for i, prediction in enumerate(predictions):
+         if prediction == 0:
+             objective.append(sentences_original[i])
+         elif prediction == 1:
+             methods.append(sentences_original[i])
+         elif prediction == 2:
+             results.append(sentences_original[i])
+         elif prediction == 3:
+             conclusion.append(sentences_original[i])
+         elif prediction == 4:
+             background.append(sentences_original[i])
+
+     end_time = timer()
+
+     return format_non_empty_lists(objective, background, methods, results, conclusion), end_time - start_time
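+
+ # Example call (assuming the model file 20k_5_epochs.keras is available locally):
+ # fragments, seconds = fragment_single_abstract("First sentence.\nSecond sentence.")
+ # print(fragments, seconds)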
+
+
+ title = "Paper Abstract Fragmentation With TensorFlow by Ryan Tietjen"
+ description = """
+ This app will take the abstract of a paper and break it down into five categories: objective, background, methods, results, and conclusion.
+ The dataset used is described in the [PubMed 200k RCT paper](https://arxiv.org/abs/1710.06071) and available in [this repo](https://github.com/Franck-Dernoncourt/pubmed-rct). The model architecture
+ is based on ["Neural Networks for Joint Sentence Classification in Medical Paper Abstracts"](https://arxiv.org/pdf/1612.05251).
+
+ This project achieved a test accuracy of 88.12% and an F1 score of 87.92%. For the whole project, please visit [my GitHub](https://github.com/RyanTietjen/Paper-Fragmentation).
+
+ How to use:
+
+ - Paste the given abstract into the box below.
+
+ - Make sure to separate each sentence with a new line (this helps avoid ambiguity).
+
+ - Click submit, and allow the model to run!
+ """
+
+ demo = gr.Interface(
+     fn=fragment_single_abstract,
+     inputs=gr.Textbox(lines=10, placeholder="Enter abstract here..."),
+     outputs=[
+         gr.Textbox(label="Fragmented Abstract"),
+         gr.Number(label="Time to process (s)"),
+     ],
+     examples=sample_list,
+     title=title,
+     description=description,
+ )
+
+ demo.launch(share=False)
model.py ADDED
@@ -0,0 +1,71 @@
+ """
+ Ryan Tietjen
+ Sep 2024
+ Create the best model for the demo
+ """
+ import tensorflow as tf
+ from keras import layers
+ import tensorflow_hub as hub
+
+ class EmbeddingLayer(layers.Layer):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         # The module URL is hardcoded directly within the layer
+         url = "https://tfhub.dev/google/universal-sentence-encoder/4"
+         self.embed_model = hub.KerasLayer(url, trainable=False, name="universal_sentence_encoder")
+
+     def call(self, inputs):
+         return self.embed_model(inputs)
+
+     def get_config(self):
+         # The URL is a fixed part of the layer, so the base config needs no extra entries
+         return super().get_config()
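+
+ # Note: a model saved with this layer must be reloaded with the layer supplied via
+ # custom_objects, as app.py does:
+ # tf.keras.models.load_model("20k_5_epochs.keras", custom_objects={'EmbeddingLayer': EmbeddingLayer()})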
+
+ def create_token_model(token_embed):
+     # token_embed is accepted for API symmetry but unused; the layer builds its own encoder
+     input_layer = layers.Input(shape=[], dtype=tf.string)
+     embedding_layer = EmbeddingLayer()
+     token_embeddings = embedding_layer(input_layer)
+     output_layer = layers.Dense(128, activation="relu")(token_embeddings)
+     model = tf.keras.Model(input_layer, output_layer)
+     return model
+
+ def create_character_vectorizer_model(char_embed, char_vectorizer):
+     input_layer = layers.Input(shape=(1,), dtype=tf.string)
+     char_vectors = char_vectorizer(input_layer)  # vectorize text inputs
+     char_embedding = char_embed(char_vectors)  # create embedding
+     output_layer = layers.Bidirectional(layers.LSTM(32))(char_embedding)
+     model = tf.keras.Model(input_layer, output_layer)
+     return model
+
+ def create_line_number_model(input_shape, name):
+     input_layer = layers.Input(shape=(input_shape,), dtype=tf.int32, name=name)
+     output_layer = layers.Dense(32, activation="relu")(input_layer)
+     model = tf.keras.Model(input_layer, output_layer)
+     return model
+
+
+ def tribrid_model(num_classes, token_embed, char_embed, text_vectorizer):
+     token_model = create_token_model(token_embed)
+     character_vectorizer_model = create_character_vectorizer_model(char_embed, text_vectorizer)
+     line_number_model = create_line_number_model(15, "line_number")
+     total_lines_model = create_line_number_model(20, "total_lines")
+
+     # Combine token-level and character-level representations of each sentence
+     hybrid_model = layers.Concatenate(name="hybrid")([token_model.output,
+                                                       character_vectorizer_model.output])
+
+     dense_layer = layers.Dense(256, activation="relu")(hybrid_model)
+     dense_layer = layers.Dropout(0.5)(dense_layer)
+
+     # Add the positional (line number) features on top of the text features
+     tribrid_concat = layers.Concatenate(name="tribrid")([line_number_model.output, total_lines_model.output, dense_layer])
+     output_layer = layers.Dense(num_classes, activation="softmax")(tribrid_concat)
+
+     model = tf.keras.Model([line_number_model.input, total_lines_model.input, token_model.input, character_vectorizer_model.input], output_layer)
+
+     model.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
+                   optimizer=tf.keras.optimizers.Adam(),
+                   metrics=["accuracy"])
+     return model
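+
+ # Minimal usage sketch (hypothetical vectorizer/embedding; the real ones come from training):
+ # char_vectorizer = tf.keras.layers.TextVectorization()  # call .adapt(...) on character-split text first
+ # char_embed = layers.Embedding(input_dim=len(char_vectorizer.get_vocabulary()), output_dim=25)
+ # model = tribrid_model(num_classes=5, token_embed=None, char_embed=char_embed, text_vectorizer=char_vectorizer)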
process_input.py ADDED
@@ -0,0 +1,67 @@
+ """
+ Sep 2024
+ Ryan Tietjen
+ Contains helper functions to process user input for the demo
+ """
+ import pandas as pd
+
+ def split_abstract(abstract):
+     results = []
+
+     lines = abstract.split("\n")
+     for i, line in enumerate(lines):
+         entry = {
+             "target": 0,  # placeholder label; the model supplies the real category
+             "text": line.lower(),
+             "line_number": i + 1,
+             "total_lines": len(lines)
+         }
+         results.append(entry)
+     return results
+
+ def split_abstract_original(abstract):
+     # Same as split_abstract, but preserves the original casing for display
+     results = []
+
+     lines = abstract.split("\n")
+     for i, line in enumerate(lines):
+         entry = {
+             "target": 0,
+             "text": line,
+             "line_number": i + 1,
+             "total_lines": len(lines)
+         }
+         results.append(entry)
+     return results
+
+ def split_sentences_by_characters(corpus):
+     # Insert spaces between characters, e.g. "abc" -> "a b c"
+     return [" ".join(sentence) for sentence in corpus]
+
+ def encode_labels(*datasets):
+     """
+     Encode labels for multiple datasets using a unified label mapping.
+
+     Args:
+         *datasets: Arbitrary number of array-like structures containing labels.
+
+     Returns:
+         tuple: Encoded labels as numpy arrays for each dataset.
+     """
+     # Collect all labels from all datasets into a single series
+     all_labels = pd.concat([pd.Series(data) for data in datasets])
+
+     # Get unique labels and sort them to ensure consistency
+     unique_labels = pd.unique(all_labels)
+     unique_labels.sort()
+
+     # Create mapping from labels to integers
+     label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
+
+     # Function to encode a single dataset
+     def encode_single_dataset(dataset, mapping):
+         return pd.Series(dataset).map(mapping).to_numpy()
+
+     # Encode all datasets using the mapping
+     encoded_datasets = tuple(encode_single_dataset(dataset, label_to_index) for dataset in datasets)
+
+     # Return only the encoded datasets
+     return encoded_datasets
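+
+ # Example with hypothetical labels (mapping is alphabetical and shared across splits):
+ # train, test = encode_labels(["BACKGROUND", "METHODS"], ["METHODS", "RESULTS"])
+ # -> (array([0, 1]), array([1, 2]))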