Use transformer tokenizer to make chunks
Based on https://gist.github.com/saprativa/b5cb639e0c035876e0dd3c46e5a380fd
Replaces a rudimentary and inaccurate method.
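
For context, a rough sketch (not part of the commit) of why a tokenizer-based count differs from a simple word count; the checkpoint name comes from the diff below, and the sample text is made up:

from transformers import AutoTokenizer

# Same checkpoint the commit loads for summarization.
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

text = "Streamlit apps can summarize long pages by splitting them into chunks."

# Rough estimate: whitespace-separated words.
word_count = len(text.split())

# What the model actually sees: subword tokens from its own tokenizer.
token_count = len(tokenizer.tokenize(text))

print(word_count, token_count)  # the token count is usually higher than the word count

Counting with the model's own tokenizer matters because the summarizer's input limit is expressed in these subword tokens, not in words or characters.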
app.py
CHANGED
@@ -5,7 +5,7 @@ import json
 import streamlit as st
 from googleapiclient.discovery import build
 from slugify import slugify
-from transformers import pipeline
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import uuid
 import spacy
 from spacy.matcher import PhraseMatcher
@@ -93,7 +93,7 @@ def get_summary( url, keywords ):
     content = prep_chunks_summary( strings, keywords )
     # Save content to cache file.
     with open( content_cache, 'w' ) as file:
-        print(content, file=file)
+        print(content.strip(), file=file)

     max_lenth = 200
     # Rudementary method to count number of tokens in a chunk.
@@ -178,25 +178,25 @@ def filter_sentences_by_keywords( strings, keywords ):

     return sentences

-def split_content_into_chunks( sentences ):
+def split_content_into_chunks( sentences, tokenizer ):
     """
     Split content into chunks.
     """
-
-
+    combined_length = 0
+    chunk = ""
     chunks = []
-    # Loop through sentences and split into chunks.
     for sentence in sentences:
-        #
-
-
-
+        # Lenth of tokens in sentence.
+        length = len( tokenizer.tokenize( sentence ) )
+
+        # If the combined token length plus the current sentence is larger then max length, start a new chunk.
+        if combined_length + length > tokenizer.max_len_single_sentence:
             chunks.append(chunk)
             chunk = '' # Reset chunk.
-
+            combined_length = 0 # Reset token length.

         # Add sentence to chunk.
-
+        combined_length += length
         chunk += sentence + ' '

     chunks.append(chunk)
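
A minimal usage sketch for the new chunking helper, assuming the split_content_into_chunks() definition from the hunk above is in scope; the sentences are made up and would normally come from filter_sentences_by_keywords():

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

# Made-up input; app.py builds this list from the filtered page content.
sentences = [
    "The first sentence talks about the topic.",
    "The second sentence adds more detail.",
    "The third sentence wraps things up.",
]

chunks = split_content_into_chunks(sentences, tokenizer)

for chunk in chunks:
    # Each chunk should stay within the model's single-sequence token budget
    # (tokenizer.max_len_single_sentence), provided no individual sentence
    # exceeds that budget on its own.
    print(len(tokenizer.tokenize(chunk)), tokenizer.max_len_single_sentence)

Tying the chunk size to tokenizer.max_len_single_sentence keeps the limit in step with the actual model (typically 1022 for this BART tokenizer, i.e. 1024 positions minus two special tokens) rather than a hard-coded value such as the max_lenth = 200 still visible in the second hunk.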
@@ -208,29 +208,37 @@ def prep_chunks_summary( strings, keywords ):
     Chunk summary.
     """
     try:
+        checkpoint = "sshleifer/distilbart-cnn-12-6"
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+
         sentences = filter_sentences_by_keywords( strings, keywords )
-        chunks
+        chunks = split_content_into_chunks( sentences, tokenizer )

+        content = ''
        number_of_chunks = len( chunks )
         # Loop through chunks if there are more than one.
         if number_of_chunks > 1:
-            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than
-            max_length = int(
+            # Calculate the max summary length based on the number of chunks so that the final combined text is not longer than max tokens.
+            max_length = int( tokenizer.max_len_single_sentence / number_of_chunks )

-            content = ''
             # Loop through chunks and generate summary.
             for chunk in chunks:
-                #
-                chunk_length = len(
+                # Number of tokens in a chunk.
+                chunk_length = len( tokenizer.tokenize( chunk ) )
                 # If chunk is shorter than max length, divide chunk length by 2.
                 if chunk_length < max_length:
                     max_length = int( chunk_length / 2 )

                 # Generate summary for chunk.
-
+                summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
+                # https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
+                chunk_summary = summarizer(chunk, max_length, min_length=10, do_sample=False, truncation=True)
+
                 for summary in chunk_summary:
                     content += summary['summary_text'] + ' '
-
+
+        elif number_of_chunks == 1:
             content = chunks[0]

         return content
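
For reference, a standalone sketch of the summarization call that the new prep_chunks_summary() makes per chunk; the input text is made up, max_length is passed as a keyword here (the pipeline docs linked in the diff pass generation parameters as keyword arguments), and the pipeline is built once up front rather than inside the loop:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# One pipeline instance reused for every chunk.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

chunk = "A made-up chunk of filtered sentences describing the page being summarized."
chunk_summary = summarizer(
    chunk,
    max_length=60,      # per-chunk summary budget
    min_length=10,
    do_sample=False,    # no sampling, deterministic decoding
    truncation=True,    # guard against chunks longer than the model limit
)

# The pipeline returns a list with one dict per input.
print(chunk_summary[0]["summary_text"])

Since the hunk above appears to construct the pipeline inside the per-chunk loop, reusing a single instance as in this sketch is a possible follow-up optimization.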