RyanTietjen committed on
Commit d85b3ec · verified · 1 Parent(s): 9aabcb5

Upload 4 files

Files changed (5)
  1. .gitattributes +1 -0
  2. 20k_5_epochs.keras +3 -0
  3. app.py +183 -0
  4. model.py +71 -0
  5. process_input.py +67 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ 20k_5_epochs.keras filter=lfs diff=lfs merge=lfs -text
20k_5_epochs.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0565eca7a0095c40e516cd84b42354a964f0143f77c31c7329818a2853307f9
+ size 1691202
app.py ADDED
@@ -0,0 +1,183 @@
+ """
+ Ryan Tietjen
+ Sep 2024
+ Demo application for paper abstract fragmentation
+ """
+ import gradio as gr
+ import tensorflow as tf
+ from tensorflow import keras
+ from keras import layers
+ from timeit import default_timer as timer
+ from process_input import split_abstract
+ from process_input import split_abstract_original
+ from process_input import split_sentences_by_characters
+ import pandas as pd
+ import tensorflow_hub as hub
+ from model import EmbeddingLayer
+ from process_input import encode_labels
+
+
+ sample_list = []
+ example1 = """The aim of this study was to describe the electrocardiographic ( ECG ) evolutionary changes after an acute myocardial infarction ( AMI ) and to evaluate their correlation with left ventricular function and remodeling.
+ The QRS complex changes after AMI have been correlated with infarct size and left ventricular function.
+ By contrast , the significance of T wave changes is controversial.
+ We studied 536 patients enrolled in the GISSI-3-Echo substudy who underwent ECG and echocardiographic studies at 24 to 48 h ( S1 ) , at hospital discharge ( S2 ) , at six weeks ( S3 ) and six months ( S4 ) after AMI.
+ The number of Q waves ( nQ ) and QRS quantitative score ( QRSs ) did not change over time.
+ From S2 to S4 , the number of negative T waves ( nT NEG ) decreased ( p < 0.0001 ) , wall motion abnormalities ( % WMA ) improved ( p < 0.001 ) , ventricular volumes increased ( p < 0.0001 ) while ejection fraction remained stable.
+ According to the T wave changes after hospital discharge , patients were divided into four groups : stable positive T waves ( group 1 , n = 35 ) , patients who showed a decrease > or = 1 in nT NEG ( group 2 , n = 361 ) , patients with no change in nT NEG ( group 3 , n = 64 ) and those with an increase > or = 1 in nT NEG ( group 4 , n = 76 ).
+ The QRSs and nQ remained stable in all groups.
+ Groups 3 and 4 showed less recovery in % WMA , more pronounced ventricular enlargement and progressive decline in ejection fraction than groups 1 and 2 ( interaction time x groups p < 0.0001 ).
+ The analysis of serial ECG can predict postinfarct left ventricular remodeling.
+ Normalization of negative T waves during the follow-up appears more strictly related to recovery of regional dysfunction than QRS changes.
+ Lack of resolution and late appearance of new negative T predict unfavorable remodeling with progressive deterioration of ventricular function."""
+ sample_list.append(example1)
+
+ def format_non_empty_lists(objective, background, methods, results, conclusion):
+     """
+     Checks each provided list and formats a string with the list name and its contents
+     only if the list is not empty.
+
+     Parameters:
+     - objective (list): List containing sentences classified as 'Objective'.
+     - background (list): List containing sentences classified as 'Background'.
+     - methods (list): List containing sentences classified as 'Methods'.
+     - results (list): List containing sentences classified as 'Results'.
+     - conclusion (list): List containing sentences classified as 'Conclusion'.
+
+     Returns:
+     - str: A formatted string that contains the non-empty list names and their contents.
+     """
+     output = ""
+     lists = {
+         'Objective': objective,
+         'Background': background,
+         'Methods': methods,
+         'Results': results,
+         'Conclusion': conclusion
+     }
+
+     for name, content in lists.items():
+         if content:  # Check if the list is not empty
+             output += f"{name}:\n"  # Append the category name followed by a newline
+             for item in content:
+                 output += f" - {item}\n"  # Append each item as a list entry
+             output += "\n"  # Blank line to separate categories
+
+     return output.strip()
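+
+ # Illustrative output, for a hypothetical input (a minimal sketch, not part of the app flow):
+ # format_non_empty_lists(["Aim."], [], ["Method."], [], [])
+ # -> "Objective:\n - Aim.\n\nMethods:\n - Method."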
+
+ def fragment_single_abstract(abstract):
+     """
+     Processes a single abstract by fragmenting it into structured sections based on predefined categories
+     such as Objective, Methods, Results, Conclusions, and Background. The function utilizes a pre-trained
+     Keras model to predict the category of each sentence in the abstract.
+
+     The process involves several steps:
+     1. Splitting the abstract into sentences.
+     2. Encoding these sentences using a custom embedding layer.
+     3. Classifying each sentence into one of the predefined categories.
+     4. Grouping the sentences by their predicted categories.
+
+     Parameters:
+     abstract (str): The abstract text that needs to be processed and categorized.
+
+     Returns:
+     tuple: A tuple containing two elements:
+         - A formatted string that lists the non-empty categories ('Objective', 'Background',
+           'Methods', 'Results', 'Conclusion') together with the sentences assigned to them.
+         - The time taken to process the abstract (in seconds).
+
+     Example:
+     ```python
+     abstract_text = "This study aims to evaluate the effectiveness of..."
+     categorized_abstract, processing_time = fragment_single_abstract(abstract_text)
+     print("Categorized Abstract:", categorized_abstract)
+     print("Processing Time:", processing_time)
+     ```
+
+     Note:
+     - This function assumes that the Keras model '20k_5_epochs.keras' and the custom embedding layer
+       'EmbeddingLayer' are available and correctly configured to be loaded.
+     - The function uses pandas for data manipulation, TensorFlow for machine learning operations,
+       and TensorFlow's data API for batching and prefetching data for model predictions.
+     """
+     start_time = timer()
+
+     original_abstract = split_abstract_original(abstract)
+     df_original = pd.DataFrame(original_abstract)
+     sentences_original = df_original["text"].tolist()
+
+     abstract_split = split_abstract(abstract)
+     df = pd.DataFrame(abstract_split)
+     sentences = df["text"].tolist()
+     labels = encode_labels(df["target"])  # placeholder labels; the predictions below ignore them
+
+     objective = []
+     background = []
+     methods = []
+     results = []
+     conclusion = []
+
+     embed_layer = EmbeddingLayer()
+     model = tf.keras.models.load_model("20k_5_epochs.keras", custom_objects={'EmbeddingLayer': embed_layer})
+
+     data_by_character = split_sentences_by_characters(sentences)
+     # One-hot depths match the training configuration (15 line positions, 20 total lines; see model.py)
+     line_numbers = tf.one_hot(df["line_number"].to_numpy(), depth=15)
+     total_line_numbers = tf.one_hot(df["total_lines"].to_numpy(), depth=20)
+
+     sentences_dataset = tf.data.Dataset.from_tensor_slices((line_numbers, total_line_numbers, sentences, data_by_character))
+     labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
+     dataset = tf.data.Dataset.zip((sentences_dataset, labels_dataset)).batch(32).prefetch(tf.data.AUTOTUNE)
+
+     predictions = tf.argmax(model.predict(dataset), axis=1)
+
+     for i, prediction in enumerate(predictions):
+         if prediction == 0:
+             objective.append(sentences_original[i])
+         elif prediction == 1:
+             methods.append(sentences_original[i])
+         elif prediction == 2:
+             results.append(sentences_original[i])
+         elif prediction == 3:
+             conclusion.append(sentences_original[i])
+         elif prediction == 4:
+             background.append(sentences_original[i])
+
+     end_time = timer()
+
+     return format_non_empty_lists(objective, background, methods, results, conclusion), end_time - start_time
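+
+ # Example call (assuming the model file 20k_5_epochs.keras is available locally):
+ # fragments, seconds = fragment_single_abstract("First sentence.\nSecond sentence.")
+ # print(fragments, seconds)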
+
+
+ title = "Paper Abstract Fragmentation With TensorFlow by Ryan Tietjen"
+ description = """
+ This app will take the abstract of a paper and break it down into five categories: objective, background, methods, results, and conclusion.
+ The dataset used is described in the [PubMed 200k RCT paper](https://arxiv.org/abs/1710.06071) and available in [this repo](https://github.com/Franck-Dernoncourt/pubmed-rct). The model architecture
+ is based on ["Neural Networks for Joint Sentence Classification in Medical Paper Abstracts"](https://arxiv.org/pdf/1612.05251).
+
+ This project achieved a test accuracy of 88.12% and an F1 score of 87.92%. For the whole project, please visit [my GitHub](https://github.com/RyanTietjen/Paper-Fragmentation).
+
+ How to use:
+
+ - Paste the given abstract into the box below.
+
+ - Make sure to separate each sentence with a new line (this helps avoid ambiguity).
+
+ - Click submit, and allow the model to run!
+ """
+
+ demo = gr.Interface(
+     fn=fragment_single_abstract,
+     inputs=gr.Textbox(lines=10, placeholder="Enter abstract here..."),
+     outputs=[
+         gr.Textbox(label="Fragmented Abstract"),
+         gr.Number(label="Time to process (s)"),
+     ],
+     examples=sample_list,
+     title=title,
+     description=description,
+ )
+
+ demo.launch(share=False)
model.py ADDED
@@ -0,0 +1,71 @@
+ """
+ Ryan Tietjen
+ Sep 2024
+ Create the best model for the demo
+ """
+ import tensorflow as tf
+ from keras import layers
+ import tensorflow_hub as hub
+
+ class EmbeddingLayer(layers.Layer):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         # The module URL is hardcoded directly within the layer
+         url = "https://tfhub.dev/google/universal-sentence-encoder/4"
+         self.embed_model = hub.KerasLayer(url, trainable=False, name="universal_sentence_encoder")
+
+     def call(self, inputs):
+         return self.embed_model(inputs)
+
+     def get_config(self):
+         # The URL is a fixed part of the layer, so the base config needs no extra entries
+         return super().get_config()
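+
+ # Note: a model saved with this layer must be reloaded with the layer supplied via
+ # custom_objects, as app.py does:
+ # tf.keras.models.load_model("20k_5_epochs.keras", custom_objects={'EmbeddingLayer': EmbeddingLayer()})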
+
+ def create_token_model(token_embed):
+     # token_embed is accepted for API symmetry but unused; the layer builds its own encoder
+     input_layer = layers.Input(shape=[], dtype=tf.string)
+     embedding_layer = EmbeddingLayer()
+     token_embeddings = embedding_layer(input_layer)
+     output_layer = layers.Dense(128, activation="relu")(token_embeddings)
+     model = tf.keras.Model(input_layer, output_layer)
+     return model
+
+ def create_character_vectorizer_model(char_embed, char_vectorizer):
+     input_layer = layers.Input(shape=(1,), dtype=tf.string)
+     char_vectors = char_vectorizer(input_layer)  # vectorize text inputs
+     char_embedding = char_embed(char_vectors)  # create embedding
+     output_layer = layers.Bidirectional(layers.LSTM(32))(char_embedding)
+     model = tf.keras.Model(input_layer, output_layer)
+     return model
+
+ def create_line_number_model(input_shape, name):
+     input_layer = layers.Input(shape=(input_shape,), dtype=tf.int32, name=name)
+     output_layer = layers.Dense(32, activation="relu")(input_layer)
+     model = tf.keras.Model(input_layer, output_layer)
+     return model
+
+
+ def tribrid_model(num_classes, token_embed, char_embed, text_vectorizer):
+     token_model = create_token_model(token_embed)
+     character_vectorizer_model = create_character_vectorizer_model(char_embed, text_vectorizer)
+     line_number_model = create_line_number_model(15, "line_number")
+     total_lines_model = create_line_number_model(20, "total_lines")
+
+     # Combine token-level and character-level representations of each sentence
+     hybrid_model = layers.Concatenate(name="hybrid")([token_model.output,
+                                                       character_vectorizer_model.output])
+
+     dense_layer = layers.Dense(256, activation="relu")(hybrid_model)
+     dense_layer = layers.Dropout(0.5)(dense_layer)
+
+     # Add the positional (line number) features on top of the text features
+     tribrid_concat = layers.Concatenate(name="tribrid")([line_number_model.output, total_lines_model.output, dense_layer])
+     output_layer = layers.Dense(num_classes, activation="softmax")(tribrid_concat)
+
+     model = tf.keras.Model([line_number_model.input, total_lines_model.input, token_model.input, character_vectorizer_model.input], output_layer)
+
+     model.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
+                   optimizer=tf.keras.optimizers.Adam(),
+                   metrics=["accuracy"])
+     return model
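+
+ # Minimal usage sketch (hypothetical vectorizer/embedding; the real ones come from training):
+ # char_vectorizer = tf.keras.layers.TextVectorization()  # call .adapt(...) on character-split text first
+ # char_embed = layers.Embedding(input_dim=len(char_vectorizer.get_vocabulary()), output_dim=25)
+ # model = tribrid_model(num_classes=5, token_embed=None, char_embed=char_embed, text_vectorizer=char_vectorizer)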
process_input.py ADDED
@@ -0,0 +1,67 @@
+ """
+ Sep 2024
+ Ryan Tietjen
+ Contains helper functions to process user input for the demo
+ """
+ import pandas as pd
+
+ def split_abstract(abstract):
+     results = []
+
+     lines = abstract.split("\n")
+     for i, line in enumerate(lines):
+         entry = {
+             "target": 0,  # placeholder label; the model supplies the real category
+             "text": line.lower(),
+             "line_number": i + 1,
+             "total_lines": len(lines)
+         }
+         results.append(entry)
+     return results
+
+ def split_abstract_original(abstract):
+     # Same as split_abstract, but preserves the original casing for display
+     results = []
+
+     lines = abstract.split("\n")
+     for i, line in enumerate(lines):
+         entry = {
+             "target": 0,
+             "text": line,
+             "line_number": i + 1,
+             "total_lines": len(lines)
+         }
+         results.append(entry)
+     return results
+
+ def split_sentences_by_characters(corpus):
+     # Insert spaces between characters, e.g. "abc" -> "a b c"
+     return [" ".join(sentence) for sentence in corpus]
+
+ def encode_labels(*datasets):
+     """
+     Encode labels for multiple datasets using a unified label mapping.
+
+     Args:
+         *datasets: Arbitrary number of array-like structures containing labels.
+
+     Returns:
+         tuple: Encoded labels as numpy arrays for each dataset.
+     """
+     # Collect all labels from all datasets into a single series
+     all_labels = pd.concat([pd.Series(data) for data in datasets])
+
+     # Get unique labels and sort them to ensure consistency
+     unique_labels = pd.unique(all_labels)
+     unique_labels.sort()
+
+     # Create mapping from labels to integers
+     label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
+
+     # Function to encode a single dataset
+     def encode_single_dataset(dataset, mapping):
+         return pd.Series(dataset).map(mapping).to_numpy()
+
+     # Encode all datasets using the mapping
+     encoded_datasets = tuple(encode_single_dataset(dataset, label_to_index) for dataset in datasets)
+
+     # Return only the encoded datasets
+     return encoded_datasets
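+
+ # Example with hypothetical labels (mapping is alphabetical and shared across splits):
+ # train, test = encode_labels(["BACKGROUND", "METHODS"], ["METHODS", "RESULTS"])
+ # -> (array([0, 1]), array([1, 2]))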