venkatchoudharyala committed (verified)
Commit 1bec2ff · 1 Parent(s): ad4a935

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,46 @@
# Headline Generation Package

This is a Python package for generating headlines from articles.

## Installation

You can install the package using pip:

```bash
pip install headline-gen
```

## Usage

```python
from headline_gen.Control import ServerCntrl, Generate

# Run this once to start the server
Server = ServerCntrl("Start")

# Generate a headline from article text
headline = Generate("Your article text goes here...", Server)
print(headline)

# Stop the server when done
ServerCntrl("Stop", Server)
```

## Description

This package provides functionality to generate headlines from article text using natural language processing techniques.

## Usage Instructions

1. Import the `ServerCntrl` and `Generate` functions from the `Control` module.
2. Start the server using `ServerCntrl("Start")`. This only needs to be done once.
3. Generate headlines using the `Generate` function, passing the article text and the server object as arguments.
4. Stop the server when done using `ServerCntrl("Stop", Server)`, as shown in the sketch below.
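
For scripts that generate several headlines in one run, here is a minimal sketch built on the same documented calls (`ServerCntrl` and `Generate`); the `try`/`finally` wrapper and the sample article strings are illustrative assumptions, not part of the package:

```python
from headline_gen.Control import ServerCntrl, Generate

# Start the CoreNLP-backed server once for the whole batch
Server = ServerCntrl("Start")
try:
    articles = [
        "First article text goes here...",
        "Second article text goes here...",
    ]
    for article in articles:
        # Generate returns the post-processed headline string
        print(Generate(article, Server))
finally:
    # Always stop the server, even if headline generation raises an error
    ServerCntrl("Stop", Server)
```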

## New Release Features (v2.6) and Bug Fixes

1. Fixed a corner case that raised a ZeroDivisionError when irregular parameters were passed to phrase extraction; the package now handles these inputs gracefully without disrupting functionality.
2. Renamed the function `ServerInit` to `ServerCntrl` for improved clarity and consistency within the codebase.
3. Streamlined dependency management by including `en_core_web_sm` directly in the downloader module.
4. Made the output more comprehensive.
build/lib/headline_gen/Control.py ADDED
@@ -0,0 +1,434 @@
import requests
import zipfile
import os
import nltk

from nltk.parse.corenlp import CoreNLPServer

from spacy_download import load_spacy
import textacy
from textacy import *
import string

import re
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy.spatial import distance
import networkx as nx

from nltk.parse.corenlp import CoreNLPParser
from nltk.tree.tree import Tree

import torch
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification

# Downloader

def Downloader():
    # Download and extract Stanford CoreNLP (plus NLTK/spaCy resources) on first use.
    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        pass
    else:
        nlp = load_spacy("en_core_web_sm")
        nltk.download('punkt')
        nltk.download('stopwords')
        url = "https://nlp.stanford.edu/software/stanford-corenlp-4.5.6.zip"

        filename = "stanford-corenlp-4.5.6.zip"
        directory = "./Parser/"

        os.makedirs(directory, exist_ok=True)

        response = requests.get(url)

        if response.status_code == 200:
            with open(os.path.join(directory, filename), 'wb') as f:
                f.write(response.content)
            print("Download successful.")

            with zipfile.ZipFile(os.path.join(directory, filename), 'r') as zip_ref:
                zip_ref.extractall(directory)
            print("Extraction successful.")
        else:
            print("Failed to download file.")

def ServerCntrl(Mode, Server=None):
    # Start or stop the local CoreNLP server; installs the parser files if they are missing.
    Path = 'Parser/'
    os.environ['CLASSPATH'] = os.path.join(Path, 'stanford-corenlp-4.5.6')

    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        if Mode == "Start":
            server = CoreNLPServer()
            server.start()
            return server
        elif Mode == "Stop":
            if Server is None:
                print("No Server Object Provided")
            else:
                Server.stop()
        else:
            print("Undefined Operation")
    else:
        print("Parser Files Not Found")
        print("Attempting to Install Parser Files (This may take a minute or two!)")
        Downloader()
        if Mode == "Start":
            server = ServerCntrl("Start")
            return server

# Key Phrase Extraction

def remove_punctuation(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

def KeyPhraseSGRank(Article):
    en = textacy.load_spacy_lang("en_core_web_sm")

    Article = remove_punctuation(Article)

    doc = textacy.make_spacy_doc(Article, lang=en)

    TopPhrases = [kps for kps, weights in textacy.extract.keyterms.sgrank(doc, ngrams=(1, 3), topn=1.0)]
    if len(TopPhrases) != 0:
        print("...Key Phrases Found...")
        print(TopPhrases)
        return TopPhrases
    else:
        print("No Specific Key Phrases Found, Terminating the Execution...")
        exit()

# Lead Sentence Extraction

class LeadSentencesOOPS:
    def __init__(self, df):
        self.df = df
        self.sentences = sent_tokenize(self.df)

    def pre_process(self):
        sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in self.sentences]
        stop_words = stopwords.words('english')
        sentence_tokens = [[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]
        return sentence_tokens

    def count_paragraphs(self):
        text = self.df
        paragraphs = re.split(r'\n\s*\n', text)
        return (paragraphs, len(paragraphs))

    def word2vec(self):
        sentence_tokens = self.pre_process()
        w2v = Word2Vec(sentence_tokens, vector_size=1, min_count=1, epochs=1500)
        sentence_embeddings = []
        max_len = max(len(tokens) for tokens in sentence_tokens)
        for words in sentence_tokens:
            embedding = [w2v.wv[word] for word in words]
            padding_length = max_len - len(embedding)
            padded_embedding = np.pad(embedding, [(0, padding_length), (0, 0)], mode='constant')
            sentence_embeddings.append(padded_embedding)
        return sentence_embeddings

    def similarity_matrix(self):
        sentence_tokens = self.pre_process()
        sentence_embeddings = self.word2vec()
        similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
        for i, row_embedding in enumerate(sentence_embeddings):
            for j, column_embedding in enumerate(sentence_embeddings):
                similarity_matrix[i][j] = 1 - distance.cosine(row_embedding.ravel(), column_embedding.ravel())
        return similarity_matrix

    def num_of_leadingsentences(self):
        num_sentences = len(self.sentences)
        if num_sentences < 5:
            top = 1
        elif num_sentences < 10:
            top = 2
        elif num_sentences < 25:
            top = 4
        elif num_sentences < 50:
            top = 9
        elif num_sentences < 100:
            top = 18
        elif num_sentences < 200:
            top = 25
        else:
            top = 40
        return top

    def text_rank(self, num_sentences_to_extract):
        li = []
        similarity_matrixs = self.similarity_matrix()
        nx_graph = nx.from_numpy_array(similarity_matrixs)
        scores = nx.pagerank(nx_graph)
        top_sentence = {sentence: scores[index] for index, sentence in enumerate(self.sentences)}
        top = dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:num_sentences_to_extract])
        for sent in self.sentences:
            if sent in top.keys():
                li.append(sent)
        return li

    def leading_sentences(self):
        article_info = self.count_paragraphs()
        leading_sentences = []
        # if the article has only a few paragraphs, the number of leading sentences is chosen from a fixed scale
        if article_info[1] <= 3:
            num_sentences_to_extract = self.num_of_leadingsentences()
            LSG_article = LeadSentencesOOPS(str(article_info[0]))
            leading_sentences.extend(LSG_article.text_rank(num_sentences_to_extract))
        else:
            num_sentences_to_extract = 1  # if the article has more than three paragraphs
            paragraphs = article_info[0]
            # extracting one leading sentence from each paragraph
            for para in paragraphs:
                LSG = LeadSentencesOOPS(para)
                output = LSG.text_rank(num_sentences_to_extract)
                leading_sentences.extend(output)
            # extracting a leading sentence from the entire article
            LSG_article = LeadSentencesOOPS(self.df)
            leading_sentences.extend(LSG_article.text_rank(num_sentences_to_extract))

        return leading_sentences

# Parsing and Compression Algo

def Parsing(Sentence, Server):
    parser = CoreNLPParser(url=Server.url)
    return next(parser.raw_parse(Sentence))

def find_leftmost_S(tree):
    if isinstance(tree, str):  # Terminal node
        return None
    elif tree.label() == 'S':  # Found leftmost S node
        return tree
    else:
        for subtree in tree:
            result = find_leftmost_S(subtree)
            if result is not None:
                return result

def Pruning(tree, Label):
    if isinstance(tree, str):
        return tree
    if tree.height() > 0:
        filtered_children = [Pruning(child, Label) for child in tree if (isinstance(child, str) or child.height() > 0) and (isinstance(child, str) or child.label() != Label)]
        return Tree(tree.label(), filtered_children)
    else:
        return tree

# Trim the headline by removing the lowest-ranked key phrases until it fits under the length threshold.
def IterativeTrimming(HeadLine, SGRankList, Threshold):
    if len(HeadLine) > Threshold:
        if len(SGRankList) > 0:
            ptr = SGRankList[-1]
        else:
            return HeadLine
        if HeadLine.find(ptr) > 0:
            if HeadLine[HeadLine.find(ptr) - 1] != ' ':
                HeadLine = HeadLine.replace(ptr, ":", 1)
            else:
                HeadLine = HeadLine.replace(' ' + ptr, "", 1)
        else:
            HeadLine = HeadLine.replace(ptr + ' ', "", 1)
        return IterativeTrimming(HeadLine, SGRankList[: len(SGRankList) - 1], Threshold)
    else:
        return HeadLine

def Extract(Treex):
    k = Treex.leaves()
    Trex = ''
    for i in k:
        Trex += i + ' '
    return Trex

def CompressionAlgorithm(LeadSents, TopPhrases, server):
    CompressedSentences = []
    for i in LeadSents:
        Suppy = remove_punctuation(i)

        ParsedSentence = Parsing(Suppy, server)

        for i in ParsedSentence:
            for j in i:
                lefts = find_leftmost_S(j)
                if lefts is not None:
                    LeftMostS = lefts
                else:
                    LeftMostS = i
            break

        Labels = ['SBAR', 'DT', 'TMP', 'CC']
        for i in Labels:
            Temp = Pruning(LeftMostS, i)
            LeftMostS = Temp

        Trex = Extract(Temp)
        Kalix = IterativeTrimming(Trex, TopPhrases, 120)

        CompressedSentences.append(Kalix)
    return CompressedSentences

# Key Phrase Matching and Ranking

def SGRMatching(HeadLine, TopPhrases):
    l, Flag, itre = len(TopPhrases), 0.0, 0
    for Phrase in TopPhrases:
        if Phrase in HeadLine:
            Flag += (l - TopPhrases.index(Phrase)) / l
            itre += 1
    return (itre * Flag) / l

def Ranking(CompressedSentences, KeyPhrases):
    ResultDict = {}
    for i in CompressedSentences:
        ResultDict[i] = SGRMatching(i, KeyPhrases)
    return ResultDict

# Post Processing using DistilBert

#
# Split text into segments of `length` words with `overlap` words of overlap
#
def split_to_segments(wrds, length, overlap):
    resp = []
    i = 0
    while True:
        wrds_split = wrds[(length * i):((length * (i + 1)) + overlap)]
        if not wrds_split:
            break

        resp_obj = {
            "text": wrds_split,
            "start_idx": length * i,
            "end_idx": (length * (i + 1)) + overlap,
        }

        resp.append(resp_obj)
        i += 1
    return resp


#
# Punctuate wordpieces
#
def punctuate_wordpiece(wordpiece, label):
    if label.startswith('UPPER'):
        wordpiece = wordpiece.upper()
    elif label.startswith('Upper'):
        wordpiece = wordpiece[0].upper() + wordpiece[1:]
    if label[-1] != '_' and label[-1] != wordpiece[-1]:
        wordpiece += label[-1]
    return wordpiece


#
# Punctuate a text segment
#
def punctuate_segment(wordpieces, word_ids, labels, start_word):
    result = ''
    for idx in range(0, len(wordpieces)):
        if word_ids[idx] is None:
            continue
        if word_ids[idx] < start_word:
            continue
        wordpiece = punctuate_wordpiece(wordpieces[idx][2:] if wordpieces[idx].startswith('##') else wordpieces[idx],
                                        labels[idx])
        if idx > 0 and len(result) > 0 and word_ids[idx] != word_ids[idx - 1] and result[-1] != '-':
            result += ' '
        result += wordpiece
    return result


#
# Tokenize, predict, and punctuate a text segment
#
def process_segment(words, tokenizer, model, start_word, encoder_max_length):
    tokens = tokenizer(words['text'],
                       padding="max_length",
                       max_length=encoder_max_length,
                       is_split_into_words=True, return_tensors='pt')

    with torch.no_grad():
        logits = model(**tokens).logits
    logits = logits.cpu()
    predictions = np.argmax(logits, axis=-1)

    wordpieces = tokens.tokens()
    word_ids = tokens.word_ids()
    id2label = model.config.id2label
    labels = [[id2label[p.item()] for p in prediction] for prediction in predictions][0]

    return punctuate_segment(wordpieces, word_ids, labels, start_word)


#
# Punctuate text of any length
#
def punctuate(text, tokenizer, model, encoder_max_length):
    text = text.lower()
    text = text.replace('\n', ' ')
    words = text.split(' ')

    overlap = 50
    slices = split_to_segments(words, 150, 50)

    result = ""
    start_word = 0
    for text in slices:
        corrected = process_segment(text, tokenizer, model, start_word, encoder_max_length)
        result += corrected + ' '
        start_word = overlap
    return result

def PostProcess(Sentence):
    checkpoint = "venkatchoudharyala/Punctuate"
    tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
    model = DistilBertForTokenClassification.from_pretrained(checkpoint)
    encoder_max_length = 256
    return punctuate(Sentence, tokenizer, model, encoder_max_length)

# End-to-end pipeline: key phrases -> leading sentences -> compression -> ranking -> punctuation post-processing.
def Generate(Article, Server):
    cleaned_article = re.sub(r'\([^)]*\)', '', Article)

    KeyPhrases = KeyPhraseSGRank(cleaned_article)

    LSG = LeadSentencesOOPS(cleaned_article)
    LeadingSentences = LSG.leading_sentences()
    print("...Leading Sentences Found...")
    print(LeadingSentences)

    CompressedSentences = CompressionAlgorithm(LeadingSentences, KeyPhrases, Server)

    ResultDict = Ranking(CompressedSentences, KeyPhrases)

    max_key = max(ResultDict, key=lambda k: ResultDict[k])
    print("...Scores of Sentences...")
    print(ResultDict)
    return PostProcess(max_key)
build/lib/headline_gen/__init__.py ADDED
File without changes
dist/headline-gen-2.3.tar.gz ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f5a5397807c25ff55f93a51b2734174a1add5524a6f3f6cd8a192fcd9c94a3df
size 6278
dist/headline-gen-2.4.tar.gz ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a755616c0f279d974304a1f1ea6bafc1f8170e06ad29a0f64a491ee1e1cab752
size 6286
dist/headline-gen-2.5.tar.gz ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8b2bc4205713c78e56c31f6d0dce5db2e8a0074c37f2cfcf07f4da7b32b85ed5
size 6225
dist/headline-gen-2.6.tar.gz ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7cf39696ea27452e3b5efab02eaa8031cfec599cb8271990b8fe37c8ab1015a7
size 6249
dist/headline_gen-2.3-py3-none-any.whl ADDED
Binary file (6.37 kB).
 
dist/headline_gen-2.4-py3-none-any.whl ADDED
Binary file (6.37 kB).
 
dist/headline_gen-2.5-py3-none-any.whl ADDED
Binary file (6.33 kB).
 
dist/headline_gen-2.6-py3-none-any.whl ADDED
Binary file (6.33 kB).
 
headline_gen.egg-info/PKG-INFO ADDED
@@ -0,0 +1,55 @@
Metadata-Version: 2.1
Name: headline-gen
Version: 2.6
Summary: Provides functionality to generate headlines from articles using natural language processing techniques.
Author: venkatchoudharyala
Author-email: [email protected]
Requires-Python: >=3.6
Description-Content-Type: text/markdown

# Headline Generation Package

This is a Python package for generating headlines from articles.

## Installation

You can install the package using pip:

```bash
pip install headline-gen
```

## Usage

```python
from headline_gen.Control import ServerCntrl, Generate

# Run this once to start the server
Server = ServerCntrl("Start")

# Generate a headline from article text
headline = Generate("Your article text goes here...", Server)
print(headline)

# Stop the server when done
ServerCntrl("Stop", Server)
```

## Description

This package provides functionality to generate headlines from article text using natural language processing techniques.

## Usage Instructions

1. Import the `ServerCntrl` and `Generate` functions from the `Control` module.
2. Start the server using `ServerCntrl("Start")`. This only needs to be done once.
3. Generate headlines using the `Generate` function, passing the article text and the server object as arguments.
4. Stop the server when done using `ServerCntrl("Stop", Server)`, as shown in the sketch below.
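
For scripts that generate several headlines in one run, here is a minimal sketch built on the same documented calls (`ServerCntrl` and `Generate`); the `try`/`finally` wrapper and the sample article strings are illustrative assumptions, not part of the package:

```python
from headline_gen.Control import ServerCntrl, Generate

# Start the CoreNLP-backed server once for the whole batch
Server = ServerCntrl("Start")
try:
    articles = [
        "First article text goes here...",
        "Second article text goes here...",
    ]
    for article in articles:
        # Generate returns the post-processed headline string
        print(Generate(article, Server))
finally:
    # Always stop the server, even if headline generation raises an error
    ServerCntrl("Stop", Server)
```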

## New Release Features (v2.6) and Bug Fixes

1. Fixed a corner case that raised a ZeroDivisionError when irregular parameters were passed to phrase extraction; the package now handles these inputs gracefully without disrupting functionality.
2. Renamed the function `ServerInit` to `ServerCntrl` for improved clarity and consistency within the codebase.
3. Streamlined dependency management by including `en_core_web_sm` directly in the downloader module.
4. Made the output more comprehensive.
headline_gen.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,9 @@
README.md
setup.py
headline_gen/Control.py
headline_gen/__init__.py
headline_gen.egg-info/PKG-INFO
headline_gen.egg-info/SOURCES.txt
headline_gen.egg-info/dependency_links.txt
headline_gen.egg-info/requires.txt
headline_gen.egg-info/top_level.txt
headline_gen.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
headline_gen.egg-info/requires.txt ADDED
@@ -0,0 +1,10 @@
requests
nltk
numpy
scipy==1.12.0
gensim
networkx
textacy
transformers
torch
spacy-download
headline_gen.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
headline_gen
headline_gen/Control.py ADDED
@@ -0,0 +1,434 @@
import requests
import zipfile
import os
import nltk

from nltk.parse.corenlp import CoreNLPServer

from spacy_download import load_spacy
import textacy
from textacy import *
import string

import re
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy.spatial import distance
import networkx as nx

from nltk.parse.corenlp import CoreNLPParser
from nltk.tree.tree import Tree

import torch
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification

# Downloader

def Downloader():
    # Download and extract Stanford CoreNLP (plus NLTK/spaCy resources) on first use.
    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        pass
    else:
        nlp = load_spacy("en_core_web_sm")
        nltk.download('punkt')
        nltk.download('stopwords')
        url = "https://nlp.stanford.edu/software/stanford-corenlp-4.5.6.zip"

        filename = "stanford-corenlp-4.5.6.zip"
        directory = "./Parser/"

        os.makedirs(directory, exist_ok=True)

        response = requests.get(url)

        if response.status_code == 200:
            with open(os.path.join(directory, filename), 'wb') as f:
                f.write(response.content)
            print("Download successful.")

            with zipfile.ZipFile(os.path.join(directory, filename), 'r') as zip_ref:
                zip_ref.extractall(directory)
            print("Extraction successful.")
        else:
            print("Failed to download file.")

def ServerCntrl(Mode, Server=None):
    # Start or stop the local CoreNLP server; installs the parser files if they are missing.
    Path = 'Parser/'
    os.environ['CLASSPATH'] = os.path.join(Path, 'stanford-corenlp-4.5.6')

    directory_path = "Parser/stanford-corenlp-4.5.6"
    if os.path.exists(directory_path) and os.listdir(directory_path):
        if Mode == "Start":
            server = CoreNLPServer()
            server.start()
            return server
        elif Mode == "Stop":
            if Server is None:
                print("No Server Object Provided")
            else:
                Server.stop()
        else:
            print("Undefined Operation")
    else:
        print("Parser Files Not Found")
        print("Attempting to Install Parser Files (This may take a minute or two!)")
        Downloader()
        if Mode == "Start":
            server = ServerCntrl("Start")
            return server

# Key Phrase Extraction

def remove_punctuation(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

def KeyPhraseSGRank(Article):
    en = textacy.load_spacy_lang("en_core_web_sm")

    Article = remove_punctuation(Article)

    doc = textacy.make_spacy_doc(Article, lang=en)

    TopPhrases = [kps for kps, weights in textacy.extract.keyterms.sgrank(doc, ngrams=(1, 3), topn=1.0)]
    if len(TopPhrases) != 0:
        print("...Key Phrases Found...")
        print(TopPhrases)
        return TopPhrases
    else:
        print("No Specific Key Phrases Found, Terminating the Execution...")
        exit()

# Lead Sentence Extraction

class LeadSentencesOOPS:
    def __init__(self, df):
        self.df = df
        self.sentences = sent_tokenize(self.df)

    def pre_process(self):
        sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in self.sentences]
        stop_words = stopwords.words('english')
        sentence_tokens = [[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]
        return sentence_tokens

    def count_paragraphs(self):
        text = self.df
        paragraphs = re.split(r'\n\s*\n', text)
        return (paragraphs, len(paragraphs))

    def word2vec(self):
        sentence_tokens = self.pre_process()
        w2v = Word2Vec(sentence_tokens, vector_size=1, min_count=1, epochs=1500)
        sentence_embeddings = []
        max_len = max(len(tokens) for tokens in sentence_tokens)
        for words in sentence_tokens:
            embedding = [w2v.wv[word] for word in words]
            padding_length = max_len - len(embedding)
            padded_embedding = np.pad(embedding, [(0, padding_length), (0, 0)], mode='constant')
            sentence_embeddings.append(padded_embedding)
        return sentence_embeddings

    def similarity_matrix(self):
        sentence_tokens = self.pre_process()
        sentence_embeddings = self.word2vec()
        similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
        for i, row_embedding in enumerate(sentence_embeddings):
            for j, column_embedding in enumerate(sentence_embeddings):
                similarity_matrix[i][j] = 1 - distance.cosine(row_embedding.ravel(), column_embedding.ravel())
        return similarity_matrix

    def num_of_leadingsentences(self):
        num_sentences = len(self.sentences)
        if num_sentences < 5:
            top = 1
        elif num_sentences < 10:
            top = 2
        elif num_sentences < 25:
            top = 4
        elif num_sentences < 50:
            top = 9
        elif num_sentences < 100:
            top = 18
        elif num_sentences < 200:
            top = 25
        else:
            top = 40
        return top

    def text_rank(self, num_sentences_to_extract):
        li = []
        similarity_matrixs = self.similarity_matrix()
        nx_graph = nx.from_numpy_array(similarity_matrixs)
        scores = nx.pagerank(nx_graph)
        top_sentence = {sentence: scores[index] for index, sentence in enumerate(self.sentences)}
        top = dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:num_sentences_to_extract])
        for sent in self.sentences:
            if sent in top.keys():
                li.append(sent)
        return li

    def leading_sentences(self):
        article_info = self.count_paragraphs()
        leading_sentences = []
        # if the article has only a few paragraphs, the number of leading sentences is chosen from a fixed scale
        if article_info[1] <= 3:
            num_sentences_to_extract = self.num_of_leadingsentences()
            LSG_article = LeadSentencesOOPS(str(article_info[0]))
            leading_sentences.extend(LSG_article.text_rank(num_sentences_to_extract))
        else:
            num_sentences_to_extract = 1  # if the article has more than three paragraphs
            paragraphs = article_info[0]
            # extracting one leading sentence from each paragraph
            for para in paragraphs:
                LSG = LeadSentencesOOPS(para)
                output = LSG.text_rank(num_sentences_to_extract)
                leading_sentences.extend(output)
            # extracting a leading sentence from the entire article
            LSG_article = LeadSentencesOOPS(self.df)
            leading_sentences.extend(LSG_article.text_rank(num_sentences_to_extract))

        return leading_sentences

# Parsing and Compression Algo

def Parsing(Sentence, Server):
    parser = CoreNLPParser(url=Server.url)
    return next(parser.raw_parse(Sentence))

def find_leftmost_S(tree):
    if isinstance(tree, str):  # Terminal node
        return None
    elif tree.label() == 'S':  # Found leftmost S node
        return tree
    else:
        for subtree in tree:
            result = find_leftmost_S(subtree)
            if result is not None:
                return result

def Pruning(tree, Label):
    if isinstance(tree, str):
        return tree
    if tree.height() > 0:
        filtered_children = [Pruning(child, Label) for child in tree if (isinstance(child, str) or child.height() > 0) and (isinstance(child, str) or child.label() != Label)]
        return Tree(tree.label(), filtered_children)
    else:
        return tree

# Trim the headline by removing the lowest-ranked key phrases until it fits under the length threshold.
def IterativeTrimming(HeadLine, SGRankList, Threshold):
    if len(HeadLine) > Threshold:
        if len(SGRankList) > 0:
            ptr = SGRankList[-1]
        else:
            return HeadLine
        if HeadLine.find(ptr) > 0:
            if HeadLine[HeadLine.find(ptr) - 1] != ' ':
                HeadLine = HeadLine.replace(ptr, ":", 1)
            else:
                HeadLine = HeadLine.replace(' ' + ptr, "", 1)
        else:
            HeadLine = HeadLine.replace(ptr + ' ', "", 1)
        return IterativeTrimming(HeadLine, SGRankList[: len(SGRankList) - 1], Threshold)
    else:
        return HeadLine

def Extract(Treex):
    k = Treex.leaves()
    Trex = ''
    for i in k:
        Trex += i + ' '
    return Trex

def CompressionAlgorithm(LeadSents, TopPhrases, server):
    CompressedSentences = []
    for i in LeadSents:
        Suppy = remove_punctuation(i)

        ParsedSentence = Parsing(Suppy, server)

        for i in ParsedSentence:
            for j in i:
                lefts = find_leftmost_S(j)
                if lefts is not None:
                    LeftMostS = lefts
                else:
                    LeftMostS = i
            break

        Labels = ['SBAR', 'DT', 'TMP', 'CC']
        for i in Labels:
            Temp = Pruning(LeftMostS, i)
            LeftMostS = Temp

        Trex = Extract(Temp)
        Kalix = IterativeTrimming(Trex, TopPhrases, 120)

        CompressedSentences.append(Kalix)
    return CompressedSentences

# Key Phrase Matching and Ranking

def SGRMatching(HeadLine, TopPhrases):
    l, Flag, itre = len(TopPhrases), 0.0, 0
    for Phrase in TopPhrases:
        if Phrase in HeadLine:
            Flag += (l - TopPhrases.index(Phrase)) / l
            itre += 1
    return (itre * Flag) / l

def Ranking(CompressedSentences, KeyPhrases):
    ResultDict = {}
    for i in CompressedSentences:
        ResultDict[i] = SGRMatching(i, KeyPhrases)
    return ResultDict

# Post Processing using DistilBert

#
# Split text into segments of `length` words with `overlap` words of overlap
#
def split_to_segments(wrds, length, overlap):
    resp = []
    i = 0
    while True:
        wrds_split = wrds[(length * i):((length * (i + 1)) + overlap)]
        if not wrds_split:
            break

        resp_obj = {
            "text": wrds_split,
            "start_idx": length * i,
            "end_idx": (length * (i + 1)) + overlap,
        }

        resp.append(resp_obj)
        i += 1
    return resp


#
# Punctuate wordpieces
#
def punctuate_wordpiece(wordpiece, label):
    if label.startswith('UPPER'):
        wordpiece = wordpiece.upper()
    elif label.startswith('Upper'):
        wordpiece = wordpiece[0].upper() + wordpiece[1:]
    if label[-1] != '_' and label[-1] != wordpiece[-1]:
        wordpiece += label[-1]
    return wordpiece


#
# Punctuate a text segment
#
def punctuate_segment(wordpieces, word_ids, labels, start_word):
    result = ''
    for idx in range(0, len(wordpieces)):
        if word_ids[idx] is None:
            continue
        if word_ids[idx] < start_word:
            continue
        wordpiece = punctuate_wordpiece(wordpieces[idx][2:] if wordpieces[idx].startswith('##') else wordpieces[idx],
                                        labels[idx])
        if idx > 0 and len(result) > 0 and word_ids[idx] != word_ids[idx - 1] and result[-1] != '-':
            result += ' '
        result += wordpiece
    return result


#
# Tokenize, predict, and punctuate a text segment
#
def process_segment(words, tokenizer, model, start_word, encoder_max_length):
    tokens = tokenizer(words['text'],
                       padding="max_length",
                       max_length=encoder_max_length,
                       is_split_into_words=True, return_tensors='pt')

    with torch.no_grad():
        logits = model(**tokens).logits
    logits = logits.cpu()
    predictions = np.argmax(logits, axis=-1)

    wordpieces = tokens.tokens()
    word_ids = tokens.word_ids()
    id2label = model.config.id2label
    labels = [[id2label[p.item()] for p in prediction] for prediction in predictions][0]

    return punctuate_segment(wordpieces, word_ids, labels, start_word)


#
# Punctuate text of any length
#
def punctuate(text, tokenizer, model, encoder_max_length):
    text = text.lower()
    text = text.replace('\n', ' ')
    words = text.split(' ')

    overlap = 50
    slices = split_to_segments(words, 150, 50)

    result = ""
    start_word = 0
    for text in slices:
        corrected = process_segment(text, tokenizer, model, start_word, encoder_max_length)
        result += corrected + ' '
        start_word = overlap
    return result

def PostProcess(Sentence):
    checkpoint = "venkatchoudharyala/Punctuate"
    tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
    model = DistilBertForTokenClassification.from_pretrained(checkpoint)
    encoder_max_length = 256
    return punctuate(Sentence, tokenizer, model, encoder_max_length)

# End-to-end pipeline: key phrases -> leading sentences -> compression -> ranking -> punctuation post-processing.
def Generate(Article, Server):
    cleaned_article = re.sub(r'\([^)]*\)', '', Article)

    KeyPhrases = KeyPhraseSGRank(cleaned_article)

    LSG = LeadSentencesOOPS(cleaned_article)
    LeadingSentences = LSG.leading_sentences()
    print("...Leading Sentences Found...")
    print(LeadingSentences)

    CompressedSentences = CompressionAlgorithm(LeadingSentences, KeyPhrases, Server)

    ResultDict = Ranking(CompressedSentences, KeyPhrases)

    max_key = max(ResultDict, key=lambda k: ResultDict[k])
    print("...Scores of Sentences...")
    print(ResultDict)
    return PostProcess(max_key)
headline_gen/__init__.py ADDED
File without changes
setup.py ADDED
@@ -0,0 +1,24 @@
from setuptools import setup, find_packages

setup(
    name='headline-gen',
    version='2.6',
    author='venkatchoudharyala',
    author_email='[email protected]',
    description='Provides functionality to generate headlines from articles using natural language processing techniques.',
    long_description=open('README.md').read(),  # Read the contents of README.md
    long_description_content_type='text/markdown',  # Specify the content type of the long description
    packages=find_packages(),  # Include the headline_gen package in the distribution
    install_requires=[
        'requests',
        'nltk',
        'numpy',
        'scipy==1.12.0',
        'gensim',
        'networkx',
        'textacy',
        'transformers',
        'torch',
        'spacy-download'
    ],
    python_requires='>=3.6',
)