Upload folder using huggingface_hub
- README.md +46 -0
- build/lib/headline_gen/Control.py +434 -0
- build/lib/headline_gen/__init__.py +0 -0
- dist/headline-gen-2.3.tar.gz +3 -0
- dist/headline-gen-2.4.tar.gz +3 -0
- dist/headline-gen-2.5.tar.gz +3 -0
- dist/headline-gen-2.6.tar.gz +3 -0
- dist/headline_gen-2.3-py3-none-any.whl +0 -0
- dist/headline_gen-2.4-py3-none-any.whl +0 -0
- dist/headline_gen-2.5-py3-none-any.whl +0 -0
- dist/headline_gen-2.6-py3-none-any.whl +0 -0
- headline_gen.egg-info/PKG-INFO +55 -0
- headline_gen.egg-info/SOURCES.txt +9 -0
- headline_gen.egg-info/dependency_links.txt +1 -0
- headline_gen.egg-info/requires.txt +10 -0
- headline_gen.egg-info/top_level.txt +1 -0
- headline_gen/Control.py +434 -0
- headline_gen/__init__.py +0 -0
- setup.py +24 -0
README.md
ADDED
@@ -0,0 +1,46 @@
# Headline Generation Package

This is a Python package for generating headlines from articles.

## Installation

You can install the package using pip:

```bash
pip install headline-gen
```

## Usage

```python
from headline_gen.Control import ServerCntrl, Generate

# Run this once to start the server
Server = ServerCntrl("Start")

# Generate headline from article text
headline = Generate("Your article text goes here...", Server)
print(headline)

# Stop the server when done
ServerCntrl("Stop", Server)
```

## Description

This package provides functionality to generate headlines from article text using natural language processing techniques.

## Usage Instructions

1. Import the `ServerCntrl` and `Generate` functions from the `Control` module.
2. Start the server using `ServerCntrl("Start")`. This only needs to be done once per session.
3. Generate headlines using the `Generate` function, passing the article text and the server object as arguments.
4. Stop the server when done using `ServerCntrl("Stop", Server)`.

## New Release Features (v2.6) and Bug Fixes

1. Fixed a corner case that caused a ZeroDivisionError when irregular parameters were passed to phrase extraction. The package now handles such scenarios gracefully without disrupting functionality.
2. Renamed the function `ServerInit` to `ServerCntrl` for improved clarity and consistency within the codebase.
3. Streamlined dependency management by downloading `en_core_web_sm` directly in the downloader module.
4. Made the console output more comprehensive.
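For a slightly fuller picture of the workflow documented in the README above, here is a minimal end-to-end sketch; the `article.txt` path and the `try/finally` wrapper are illustrative additions, not part of the package API:

```python
from headline_gen.Control import ServerCntrl, Generate

# Start the CoreNLP-backed parser server once per session
server = ServerCntrl("Start")
try:
    # Read an article from a local text file (hypothetical path)
    with open("article.txt", encoding="utf-8") as f:
        article = f.read()

    # Generate returns the highest-ranked, punctuation-restored headline
    headline = Generate(article, server)
    print(headline)
finally:
    # Release the parser server even if generation fails
    ServerCntrl("Stop", server)
```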
build/lib/headline_gen/Control.py
ADDED
@@ -0,0 +1,434 @@
import requests
import zipfile
import os
import nltk

from nltk.parse.corenlp import CoreNLPServer

from spacy_download import load_spacy
import textacy
from textacy import *
import string

import re
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy.spatial import distance
import networkx as nx

from nltk.parse.corenlp import CoreNLPParser
from nltk.tree.tree import Tree

import torch
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification


# Downloader

def Downloader():
    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        pass
    else:
        nlp = load_spacy("en_core_web_sm")  # downloads the spaCy model if it is not already installed
        nltk.download('punkt')
        nltk.download('stopwords')
        url = "https://nlp.stanford.edu/software/stanford-corenlp-4.5.6.zip"

        filename = "stanford-corenlp-4.5.6.zip"
        directory = "./Parser/"

        os.makedirs(directory, exist_ok=True)

        response = requests.get(url)

        if response.status_code == 200:
            with open(os.path.join(directory, filename), 'wb') as f:
                f.write(response.content)
            print("Download successful.")

            with zipfile.ZipFile(os.path.join(directory, filename), 'r') as zip_ref:
                zip_ref.extractall(directory)
            print("Extraction successful.")
        else:
            print("Failed to download file.")


def ServerCntrl(Mode, Server=None):
    Path = 'Parser/'
    os.environ['CLASSPATH'] = os.path.join(Path, 'stanford-corenlp-4.5.6')

    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        if Mode == "Start":
            server = CoreNLPServer()
            server.start()
            return server
        elif Mode == "Stop":
            if Server is None:
                print("No Server Object Provided")
            else:
                Server.stop()
        else:
            print("Undefined Operation")
    else:
        print("Parser Files Not Found")
        print("Attempting to Install Parser Files (This may take a Min or Two!!)")
        Downloader()
        if Mode == "Start":
            server = ServerCntrl("Start")
            return server


# Key Phrase Extraction

def remove_punctuation(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)


def KeyPhraseSGRank(Article):
    en = textacy.load_spacy_lang("en_core_web_sm")

    Article = remove_punctuation(Article)

    doc = textacy.make_spacy_doc(Article, lang=en)

    TopPhrases = [kps for kps, weights in textacy.extract.keyterms.sgrank(doc, ngrams=(1, 3), topn=1.0)]
    if len(TopPhrases) != 0:
        print("...Key Phrases Found...")
        print(TopPhrases)
        return TopPhrases
    else:
        print("No Specific Key Phrases Found, Terminating the Execution...")
        exit()


# Lead Sentence Extraction

class LeadSentencesOOPS:
    def __init__(self, df):
        self.df = df
        self.sentences = sent_tokenize(self.df)

    def pre_process(self):
        sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in self.sentences]
        stop_words = stopwords.words('english')
        sentence_tokens = [[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]
        return sentence_tokens

    def count_paragraphs(self):
        text = self.df
        paragraphs = re.split(r'\n\s*\n', text)
        return (paragraphs, len(paragraphs))

    def word2vec(self):
        sentence_tokens = self.pre_process()
        w2v = Word2Vec(sentence_tokens, vector_size=1, min_count=1, epochs=1500)
        sentence_embeddings = []
        max_len = max(len(tokens) for tokens in sentence_tokens)
        for words in sentence_tokens:
            embedding = [w2v.wv[word] for word in words]
            padding_length = max_len - len(embedding)
            padded_embedding = np.pad(embedding, [(0, padding_length), (0, 0)], mode='constant')
            sentence_embeddings.append(padded_embedding)
        return sentence_embeddings

    def similarity_matrix(self):
        sentence_tokens = self.pre_process()
        sentence_embeddings = self.word2vec()
        similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
        for i, row_embedding in enumerate(sentence_embeddings):
            for j, column_embedding in enumerate(sentence_embeddings):
                similarity_matrix[i][j] = 1 - distance.cosine(row_embedding.ravel(), column_embedding.ravel())
        return similarity_matrix

    def num_of_leadingsentences(self):
        num_sentences = len(self.sentences)
        if num_sentences < 5:
            top = 1
        elif num_sentences < 10:
            top = 2
        elif num_sentences < 25:
            top = 4
        elif num_sentences < 50:
            top = 9
        elif num_sentences < 100:
            top = 18
        elif num_sentences < 200:
            top = 25
        else:
            top = 40
        return top

    def text_rank(self, num_sentences_to_extract):
        li = []
        similarity_matrixs = self.similarity_matrix()
        nx_graph = nx.from_numpy_array(similarity_matrixs)
        scores = nx.pagerank(nx_graph)
        top_sentence = {sentence: scores[index] for index, sentence in enumerate(self.sentences)}
        top = dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:num_sentences_to_extract])
        for sent in self.sentences:
            if sent in top.keys():
                li.append(sent)
        return li

    def leading_sentences(self):
        article_info = self.count_paragraphs()
        leading_sentences = []
        # With three or fewer paragraphs, pick the number of leading sentences from the fixed scale above
        if article_info[1] <= 3:
            num_sentences_to_extract = self.num_of_leadingsentences()
            LSG_article = LeadSentencesOOPS(str(article_info[0]))
            leading_sentences.extend(LSG_article.text_rank(num_sentences_to_extract))
        else:
            num_sentences_to_extract = 1  # one leading sentence per paragraph for longer articles
            paragraphs = article_info[0]
            # Extract one leading sentence from each paragraph
            for para in paragraphs:
                LSG = LeadSentencesOOPS(para)
                output = LSG.text_rank(num_sentences_to_extract)
                leading_sentences.extend(output)
            # Also take a leading sentence from the last paragraph
            LSG_article = LeadSentencesOOPS(para)
            leading_sentences.extend(LSG_article.text_rank(num_sentences_to_extract))

        return leading_sentences


# Parsing and Compression Algorithm

def Parsing(Sentence, Server):
    parser = CoreNLPParser(url=Server.url)
    return next(parser.raw_parse(Sentence))


def find_leftmost_S(tree):
    if isinstance(tree, str):  # Terminal node
        return None
    elif tree.label() == 'S':  # Found leftmost S node
        return tree
    else:
        for subtree in tree:
            result = find_leftmost_S(subtree)
            if result is not None:
                return result


def Pruning(tree, Label):
    if isinstance(tree, str):
        return tree
    if tree.height() > 0:
        filtered_children = [Pruning(child, Label) for child in tree if (isinstance(child, str) or child.height() > 0) and (isinstance(child, str) or child.label() != Label)]
        return Tree(tree.label(), filtered_children)
    else:
        return tree


def IterativeTrimming(HeadLine, SGRankList, Threshold):
    # Drop the least important key phrases one at a time until the headline fits the threshold
    if len(HeadLine) > Threshold:
        if len(SGRankList) > 0:
            ptr = SGRankList[-1]
        else:
            return HeadLine
        if HeadLine.find(ptr) > 0:
            if HeadLine[HeadLine.find(ptr) - 1] != ' ':
                HeadLine = HeadLine.replace(ptr, ":", 1)
            else:
                HeadLine = HeadLine.replace(' ' + ptr, "", 1)
        else:
            HeadLine = HeadLine.replace(ptr + ' ', "", 1)
        return IterativeTrimming(HeadLine, SGRankList[: len(SGRankList) - 1], Threshold)
    else:
        return HeadLine


def Extract(Treex):
    k = Treex.leaves()
    Trex = ''
    for i in k:
        Trex += i + ' '
    return Trex


def CompressionAlgorithm(LeadSents, TopPhrases, server):
    CompressedSentences = []
    for sent in LeadSents:
        Suppy = remove_punctuation(sent)

        ParsedSentence = Parsing(Suppy, server)

        # Keep the left-most S (clause) subtree of the parse, falling back to the whole subtree
        for tree in ParsedSentence:
            for subtree in tree:
                lefts = find_leftmost_S(subtree)
                if lefts is not None:
                    LeftMostS = lefts
                else:
                    LeftMostS = tree
                break

        # Prune subordinate clauses, determiners, temporal modifiers and conjunctions
        Labels = ['SBAR', 'DT', 'TMP', 'CC']
        for label in Labels:
            Temp = Pruning(LeftMostS, label)
            LeftMostS = Temp

        Trex = Extract(Temp)
        Kalix = IterativeTrimming(Trex, TopPhrases, 120)

        CompressedSentences.append(Kalix)
    return CompressedSentences


# Key Phrase Matching and Ranking

def SGRMatching(HeadLine, TopPhrases):
    l, Flag, itre = len(TopPhrases), 0.0, 0
    for Phrase in TopPhrases:
        if Phrase in HeadLine:
            Flag += (l - TopPhrases.index(Phrase)) / l
            itre += 1
    return (itre * Flag) / l


def Ranking(CompressedSentences, KeyPhrases):
    ResultDict = {}
    for i in CompressedSentences:
        ResultDict[i] = SGRMatching(i, KeyPhrases)
    return ResultDict


# Post Processing using DistilBert

#
# Split a word list into segments of the given length, with the given overlap
#
def split_to_segments(wrds, length, overlap):
    resp = []
    i = 0
    while True:
        wrds_split = wrds[(length * i):((length * (i + 1)) + overlap)]
        if not wrds_split:
            break

        resp_obj = {
            "text": wrds_split,
            "start_idx": length * i,
            "end_idx": (length * (i + 1)) + overlap,
        }

        resp.append(resp_obj)
        i += 1
    return resp


#
# Punctuate wordpieces
#
def punctuate_wordpiece(wordpiece, label):
    if label.startswith('UPPER'):
        wordpiece = wordpiece.upper()
    elif label.startswith('Upper'):
        wordpiece = wordpiece[0].upper() + wordpiece[1:]
    if label[-1] != '_' and label[-1] != wordpiece[-1]:
        wordpiece += label[-1]
    return wordpiece


#
# Punctuate a text segment
#
def punctuate_segment(wordpieces, word_ids, labels, start_word):
    result = ''
    for idx in range(0, len(wordpieces)):
        if word_ids[idx] is None:
            continue
        if word_ids[idx] < start_word:
            continue
        wordpiece = punctuate_wordpiece(wordpieces[idx][2:] if wordpieces[idx].startswith('##') else wordpieces[idx],
                                        labels[idx])
        if idx > 0 and len(result) > 0 and word_ids[idx] != word_ids[idx - 1] and result[-1] != '-':
            result += ' '
        result += wordpiece
    return result


#
# Tokenize, predict and punctuate a text segment
#
def process_segment(words, tokenizer, model, start_word, encoder_max_length):
    tokens = tokenizer(words['text'],
                       padding="max_length",
                       max_length=encoder_max_length,
                       is_split_into_words=True, return_tensors='pt')

    with torch.no_grad():
        logits = model(**tokens).logits
        logits = logits.cpu()
        predictions = np.argmax(logits, axis=-1)

    wordpieces = tokens.tokens()
    word_ids = tokens.word_ids()
    id2label = model.config.id2label
    labels = [[id2label[p.item()] for p in prediction] for prediction in predictions][0]

    return punctuate_segment(wordpieces, word_ids, labels, start_word)


#
# Punctuate text of any length
#
def punctuate(text, tokenizer, model, encoder_max_length):
    text = text.lower()
    text = text.replace('\n', ' ')
    words = text.split(' ')

    overlap = 50
    slices = split_to_segments(words, 150, 50)

    result = ""
    start_word = 0
    for text in slices:
        corrected = process_segment(text, tokenizer, model, start_word, encoder_max_length)
        result += corrected + ' '
        start_word = overlap
    return result


def PostProcess(Sentence):
    checkpoint = "venkatchoudharyala/Punctuate"
    tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
    model = DistilBertForTokenClassification.from_pretrained(checkpoint)
    encoder_max_length = 256
    return punctuate(Sentence, tokenizer, model, encoder_max_length)


def Generate(Article, Server):
    # Strip parenthesised asides from the article
    cleaned_article = re.sub(r'\([^)]*\)', '', Article)

    KeyPhrases = KeyPhraseSGRank(cleaned_article)

    LSG = LeadSentencesOOPS(cleaned_article)
    LeadingSentences = LSG.leading_sentences()
    print("...Leading Sentences Found...")
    print(LeadingSentences)

    CompressedSentences = CompressionAlgorithm(LeadingSentences, KeyPhrases, Server)

    ResultDict = Ranking(CompressedSentences, KeyPhrases)

    max_key = max(ResultDict, key=lambda k: ResultDict[k])
    print("...Scores of Sentences...")
    print(ResultDict)
    return PostProcess(max_key)
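A side note on the post-processing stage in the module above: `split_to_segments` chunks a word list into overlapping windows before punctuation restoration. A minimal sketch of its behaviour, assuming the package and its dependencies are installed; the toy word list and window sizes are illustrative:

```python
from headline_gen.Control import split_to_segments

# Eight toy "words", window length 3, overlap 1
words = ["w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8"]

for seg in split_to_segments(words, length=3, overlap=1):
    # Each segment carries its slice of words plus the start/end indices of the window
    print(seg["start_idx"], seg["end_idx"], seg["text"])

# Expected windows: w1-w4, w4-w7, w7-w8 (each window shares its first word with the
# end of the previous one), which is how punctuate() stitches long inputs together.
```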
build/lib/headline_gen/__init__.py
ADDED
File without changes
dist/headline-gen-2.3.tar.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f5a5397807c25ff55f93a51b2734174a1add5524a6f3f6cd8a192fcd9c94a3df
size 6278
dist/headline-gen-2.4.tar.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a755616c0f279d974304a1f1ea6bafc1f8170e06ad29a0f64a491ee1e1cab752
size 6286
dist/headline-gen-2.5.tar.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8b2bc4205713c78e56c31f6d0dce5db2e8a0074c37f2cfcf07f4da7b32b85ed5
size 6225
dist/headline-gen-2.6.tar.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7cf39696ea27452e3b5efab02eaa8031cfec599cb8271990b8fe37c8ab1015a7
size 6249
dist/headline_gen-2.3-py3-none-any.whl
ADDED
Binary file (6.37 kB).
dist/headline_gen-2.4-py3-none-any.whl
ADDED
Binary file (6.37 kB).
dist/headline_gen-2.5-py3-none-any.whl
ADDED
Binary file (6.33 kB).
dist/headline_gen-2.6-py3-none-any.whl
ADDED
Binary file (6.33 kB).
headline_gen.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,55 @@
Metadata-Version: 2.1
Name: headline-gen
Version: 2.6
Summary: Provides functionality to generate headlines from articles using natural language processing techniques.
Author: venkatchoudharyala
Author-email: [email protected]
Requires-Python: >=3.6
Description-Content-Type: text/markdown

# Headline Generation Package

This is a Python package for generating headlines from articles.

## Installation

You can install the package using pip:

```bash
pip install headline-gen
```

## Usage

```python
from headline_gen.Control import ServerCntrl, Generate

# Run this once to start the server
Server = ServerCntrl("Start")

# Generate headline from article text
headline = Generate("Your article text goes here...", Server)
print(headline)

# Stop the server when done
ServerCntrl("Stop", Server)
```

## Description

This package provides functionality to generate headlines from article text using natural language processing techniques.

## Usage Instructions

1. Import the `ServerCntrl` and `Generate` functions from the `Control` module.
2. Start the server using `ServerCntrl("Start")`. This only needs to be done once per session.
3. Generate headlines using the `Generate` function, passing the article text and the server object as arguments.
4. Stop the server when done using `ServerCntrl("Stop", Server)`.

## New Release Features (v2.6) and Bug Fixes

1. Fixed a corner case that caused a ZeroDivisionError when irregular parameters were passed to phrase extraction. The package now handles such scenarios gracefully without disrupting functionality.
2. Renamed the function `ServerInit` to `ServerCntrl` for improved clarity and consistency within the codebase.
3. Streamlined dependency management by downloading `en_core_web_sm` directly in the downloader module.
4. Made the console output more comprehensive.
headline_gen.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,9 @@
README.md
setup.py
headline_gen/Control.py
headline_gen/__init__.py
headline_gen.egg-info/PKG-INFO
headline_gen.egg-info/SOURCES.txt
headline_gen.egg-info/dependency_links.txt
headline_gen.egg-info/requires.txt
headline_gen.egg-info/top_level.txt
headline_gen.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
headline_gen.egg-info/requires.txt
ADDED
@@ -0,0 +1,10 @@
requests
nltk
numpy
scipy==1.12.0
gensim
networkx
textacy
transformers
torch
spacy-download
headline_gen.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
headline_gen
headline_gen/Control.py
ADDED
@@ -0,0 +1,434 @@
import requests
import zipfile
import os
import nltk

from nltk.parse.corenlp import CoreNLPServer

from spacy_download import load_spacy
import textacy
from textacy import *
import string

import re
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy.spatial import distance
import networkx as nx

from nltk.parse.corenlp import CoreNLPParser
from nltk.tree.tree import Tree

import torch
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification


# Downloader

def Downloader():
    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        pass
    else:
        nlp = load_spacy("en_core_web_sm")  # downloads the spaCy model if it is not already installed
        nltk.download('punkt')
        nltk.download('stopwords')
        url = "https://nlp.stanford.edu/software/stanford-corenlp-4.5.6.zip"

        filename = "stanford-corenlp-4.5.6.zip"
        directory = "./Parser/"

        os.makedirs(directory, exist_ok=True)

        response = requests.get(url)

        if response.status_code == 200:
            with open(os.path.join(directory, filename), 'wb') as f:
                f.write(response.content)
            print("Download successful.")

            with zipfile.ZipFile(os.path.join(directory, filename), 'r') as zip_ref:
                zip_ref.extractall(directory)
            print("Extraction successful.")
        else:
            print("Failed to download file.")


def ServerCntrl(Mode, Server=None):
    Path = 'Parser/'
    os.environ['CLASSPATH'] = os.path.join(Path, 'stanford-corenlp-4.5.6')

    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        if Mode == "Start":
            server = CoreNLPServer()
            server.start()
            return server
        elif Mode == "Stop":
            if Server is None:
                print("No Server Object Provided")
            else:
                Server.stop()
        else:
            print("Undefined Operation")
    else:
        print("Parser Files Not Found")
        print("Attempting to Install Parser Files (This may take a Min or Two!!)")
        Downloader()
        if Mode == "Start":
            server = ServerCntrl("Start")
            return server


# Key Phrase Extraction

def remove_punctuation(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)


def KeyPhraseSGRank(Article):
    en = textacy.load_spacy_lang("en_core_web_sm")

    Article = remove_punctuation(Article)

    doc = textacy.make_spacy_doc(Article, lang=en)

    TopPhrases = [kps for kps, weights in textacy.extract.keyterms.sgrank(doc, ngrams=(1, 3), topn=1.0)]
    if len(TopPhrases) != 0:
        print("...Key Phrases Found...")
        print(TopPhrases)
        return TopPhrases
    else:
        print("No Specific Key Phrases Found, Terminating the Execution...")
        exit()


# Lead Sentence Extraction

class LeadSentencesOOPS:
    def __init__(self, df):
        self.df = df
        self.sentences = sent_tokenize(self.df)

    def pre_process(self):
        sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in self.sentences]
        stop_words = stopwords.words('english')
        sentence_tokens = [[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]
        return sentence_tokens

    def count_paragraphs(self):
        text = self.df
        paragraphs = re.split(r'\n\s*\n', text)
        return (paragraphs, len(paragraphs))

    def word2vec(self):
        sentence_tokens = self.pre_process()
        w2v = Word2Vec(sentence_tokens, vector_size=1, min_count=1, epochs=1500)
        sentence_embeddings = []
        max_len = max(len(tokens) for tokens in sentence_tokens)
        for words in sentence_tokens:
            embedding = [w2v.wv[word] for word in words]
            padding_length = max_len - len(embedding)
            padded_embedding = np.pad(embedding, [(0, padding_length), (0, 0)], mode='constant')
            sentence_embeddings.append(padded_embedding)
        return sentence_embeddings

    def similarity_matrix(self):
        sentence_tokens = self.pre_process()
        sentence_embeddings = self.word2vec()
        similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
        for i, row_embedding in enumerate(sentence_embeddings):
            for j, column_embedding in enumerate(sentence_embeddings):
                similarity_matrix[i][j] = 1 - distance.cosine(row_embedding.ravel(), column_embedding.ravel())
        return similarity_matrix

    def num_of_leadingsentences(self):
        num_sentences = len(self.sentences)
        if num_sentences < 5:
            top = 1
        elif num_sentences < 10:
            top = 2
        elif num_sentences < 25:
            top = 4
        elif num_sentences < 50:
            top = 9
        elif num_sentences < 100:
            top = 18
        elif num_sentences < 200:
            top = 25
        else:
            top = 40
        return top

    def text_rank(self, num_sentences_to_extract):
        li = []
        similarity_matrixs = self.similarity_matrix()
        nx_graph = nx.from_numpy_array(similarity_matrixs)
        scores = nx.pagerank(nx_graph)
        top_sentence = {sentence: scores[index] for index, sentence in enumerate(self.sentences)}
        top = dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:num_sentences_to_extract])
        for sent in self.sentences:
            if sent in top.keys():
                li.append(sent)
        return li

    def leading_sentences(self):
        article_info = self.count_paragraphs()
        leading_sentences = []
        # With three or fewer paragraphs, pick the number of leading sentences from the fixed scale above
        if article_info[1] <= 3:
            num_sentences_to_extract = self.num_of_leadingsentences()
            LSG_article = LeadSentencesOOPS(str(article_info[0]))
            leading_sentences.extend(LSG_article.text_rank(num_sentences_to_extract))
        else:
            num_sentences_to_extract = 1  # one leading sentence per paragraph for longer articles
            paragraphs = article_info[0]
            # Extract one leading sentence from each paragraph
            for para in paragraphs:
                LSG = LeadSentencesOOPS(para)
                output = LSG.text_rank(num_sentences_to_extract)
                leading_sentences.extend(output)
            # Also take a leading sentence from the last paragraph
            LSG_article = LeadSentencesOOPS(para)
            leading_sentences.extend(LSG_article.text_rank(num_sentences_to_extract))

        return leading_sentences


# Parsing and Compression Algorithm

def Parsing(Sentence, Server):
    parser = CoreNLPParser(url=Server.url)
    return next(parser.raw_parse(Sentence))


def find_leftmost_S(tree):
    if isinstance(tree, str):  # Terminal node
        return None
    elif tree.label() == 'S':  # Found leftmost S node
        return tree
    else:
        for subtree in tree:
            result = find_leftmost_S(subtree)
            if result is not None:
                return result


def Pruning(tree, Label):
    if isinstance(tree, str):
        return tree
    if tree.height() > 0:
        filtered_children = [Pruning(child, Label) for child in tree if (isinstance(child, str) or child.height() > 0) and (isinstance(child, str) or child.label() != Label)]
        return Tree(tree.label(), filtered_children)
    else:
        return tree


def IterativeTrimming(HeadLine, SGRankList, Threshold):
    # Drop the least important key phrases one at a time until the headline fits the threshold
    if len(HeadLine) > Threshold:
        if len(SGRankList) > 0:
            ptr = SGRankList[-1]
        else:
            return HeadLine
        if HeadLine.find(ptr) > 0:
            if HeadLine[HeadLine.find(ptr) - 1] != ' ':
                HeadLine = HeadLine.replace(ptr, ":", 1)
            else:
                HeadLine = HeadLine.replace(' ' + ptr, "", 1)
        else:
            HeadLine = HeadLine.replace(ptr + ' ', "", 1)
        return IterativeTrimming(HeadLine, SGRankList[: len(SGRankList) - 1], Threshold)
    else:
        return HeadLine


def Extract(Treex):
    k = Treex.leaves()
    Trex = ''
    for i in k:
        Trex += i + ' '
    return Trex


def CompressionAlgorithm(LeadSents, TopPhrases, server):
    CompressedSentences = []
    for sent in LeadSents:
        Suppy = remove_punctuation(sent)

        ParsedSentence = Parsing(Suppy, server)

        # Keep the left-most S (clause) subtree of the parse, falling back to the whole subtree
        for tree in ParsedSentence:
            for subtree in tree:
                lefts = find_leftmost_S(subtree)
                if lefts is not None:
                    LeftMostS = lefts
                else:
                    LeftMostS = tree
                break

        # Prune subordinate clauses, determiners, temporal modifiers and conjunctions
        Labels = ['SBAR', 'DT', 'TMP', 'CC']
        for label in Labels:
            Temp = Pruning(LeftMostS, label)
            LeftMostS = Temp

        Trex = Extract(Temp)
        Kalix = IterativeTrimming(Trex, TopPhrases, 120)

        CompressedSentences.append(Kalix)
    return CompressedSentences


# Key Phrase Matching and Ranking

def SGRMatching(HeadLine, TopPhrases):
    l, Flag, itre = len(TopPhrases), 0.0, 0
    for Phrase in TopPhrases:
        if Phrase in HeadLine:
            Flag += (l - TopPhrases.index(Phrase)) / l
            itre += 1
    return (itre * Flag) / l


def Ranking(CompressedSentences, KeyPhrases):
    ResultDict = {}
    for i in CompressedSentences:
        ResultDict[i] = SGRMatching(i, KeyPhrases)
    return ResultDict


# Post Processing using DistilBert

#
# Split a word list into segments of the given length, with the given overlap
#
def split_to_segments(wrds, length, overlap):
    resp = []
    i = 0
    while True:
        wrds_split = wrds[(length * i):((length * (i + 1)) + overlap)]
        if not wrds_split:
            break

        resp_obj = {
            "text": wrds_split,
            "start_idx": length * i,
            "end_idx": (length * (i + 1)) + overlap,
        }

        resp.append(resp_obj)
        i += 1
    return resp


#
# Punctuate wordpieces
#
def punctuate_wordpiece(wordpiece, label):
    if label.startswith('UPPER'):
        wordpiece = wordpiece.upper()
    elif label.startswith('Upper'):
        wordpiece = wordpiece[0].upper() + wordpiece[1:]
    if label[-1] != '_' and label[-1] != wordpiece[-1]:
        wordpiece += label[-1]
    return wordpiece


#
# Punctuate a text segment
#
def punctuate_segment(wordpieces, word_ids, labels, start_word):
    result = ''
    for idx in range(0, len(wordpieces)):
        if word_ids[idx] is None:
            continue
        if word_ids[idx] < start_word:
            continue
        wordpiece = punctuate_wordpiece(wordpieces[idx][2:] if wordpieces[idx].startswith('##') else wordpieces[idx],
                                        labels[idx])
        if idx > 0 and len(result) > 0 and word_ids[idx] != word_ids[idx - 1] and result[-1] != '-':
            result += ' '
        result += wordpiece
    return result


#
# Tokenize, predict and punctuate a text segment
#
def process_segment(words, tokenizer, model, start_word, encoder_max_length):
    tokens = tokenizer(words['text'],
                       padding="max_length",
                       max_length=encoder_max_length,
                       is_split_into_words=True, return_tensors='pt')

    with torch.no_grad():
        logits = model(**tokens).logits
        logits = logits.cpu()
        predictions = np.argmax(logits, axis=-1)

    wordpieces = tokens.tokens()
    word_ids = tokens.word_ids()
    id2label = model.config.id2label
    labels = [[id2label[p.item()] for p in prediction] for prediction in predictions][0]

    return punctuate_segment(wordpieces, word_ids, labels, start_word)


#
# Punctuate text of any length
#
def punctuate(text, tokenizer, model, encoder_max_length):
    text = text.lower()
    text = text.replace('\n', ' ')
    words = text.split(' ')

    overlap = 50
    slices = split_to_segments(words, 150, 50)

    result = ""
    start_word = 0
    for text in slices:
        corrected = process_segment(text, tokenizer, model, start_word, encoder_max_length)
        result += corrected + ' '
        start_word = overlap
    return result


def PostProcess(Sentence):
    checkpoint = "venkatchoudharyala/Punctuate"
    tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
    model = DistilBertForTokenClassification.from_pretrained(checkpoint)
    encoder_max_length = 256
    return punctuate(Sentence, tokenizer, model, encoder_max_length)


def Generate(Article, Server):
    # Strip parenthesised asides from the article
    cleaned_article = re.sub(r'\([^)]*\)', '', Article)

    KeyPhrases = KeyPhraseSGRank(cleaned_article)

    LSG = LeadSentencesOOPS(cleaned_article)
    LeadingSentences = LSG.leading_sentences()
    print("...Leading Sentences Found...")
    print(LeadingSentences)

    CompressedSentences = CompressionAlgorithm(LeadingSentences, KeyPhrases, Server)

    ResultDict = Ranking(CompressedSentences, KeyPhrases)

    max_key = max(ResultDict, key=lambda k: ResultDict[k])
    print("...Scores of Sentences...")
    print(ResultDict)
    return PostProcess(max_key)
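To make the ranking step concrete: `SGRMatching` rewards candidate sentences that contain the higher-ranked key phrases. A minimal sketch, assuming the package is installed; the phrases and the candidate sentence are made up for illustration:

```python
from headline_gen.Control import SGRMatching

# Key phrases ordered from most to least important, as KeyPhraseSGRank returns them
phrases = ["climate summit", "world leaders", "carbon tax"]

candidate = "world leaders agree on carbon tax at climate summit"

# Each matched phrase contributes (len(phrases) - rank) / len(phrases); the sum is
# scaled by the number of matches and normalised by the list length again.
score = SGRMatching(candidate, phrases)
print(score)  # 3 matches: (3 * (3/3 + 2/3 + 1/3)) / 3 = 2.0
```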
headline_gen/__init__.py
ADDED
File without changes
setup.py
ADDED
@@ -0,0 +1,24 @@
from setuptools import setup, find_packages

setup(
    name='headline-gen',
    version='2.6',
    author='venkatchoudharyala',
    author_email='[email protected]',
    description='Provides functionality to generate headlines from articles using natural language processing techniques.',
    long_description=open('README.md').read(),  # Read the contents of README.md
    long_description_content_type='text/markdown',  # Specify the content type of the long description
    packages=find_packages(),  # Include the headline_gen package
    install_requires=[
        'requests',
        'nltk',
        'numpy',
        'scipy==1.12.0',
        'gensim',
        'networkx',
        'textacy',
        'transformers',
        'torch',
        'spacy-download'
    ],
    python_requires='>=3.6',
)