MatthiasC committed on
Commit
f51bffc
·
1 Parent(s): 8545c27

Add more articles/summaries and custom renderer (still needs to be cleaned up and tested further)

Browse files
.idea/HFSummSpace.iml CHANGED
@@ -8,7 +8,7 @@
8
  <orderEntry type="sourceFolder" forTests="false" />
9
  </component>
10
  <component name="PyDocumentationSettings">
11
- <option name="format" value="PLAIN" />
12
- <option name="myDocStringFormat" value="Plain" />
13
  </component>
14
  </module>
 
8
  <orderEntry type="sourceFolder" forTests="false" />
9
  </component>
10
  <component name="PyDocumentationSettings">
11
+ <option name="format" value="EPYTEXT" />
12
+ <option name="myDocStringFormat" value="Epytext" />
13
  </component>
14
  </module>
README.md CHANGED
@@ -10,3 +10,7 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
 
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
13
+
14
+ sudo lsof -i:5000
15
+ kill -9 67007(=PID)
16
+
__pycache__/custom_renderer.cpython-37.pyc ADDED
Binary file (6.5 kB). View file
 
app.py CHANGED
@@ -3,9 +3,18 @@ from typing import AnyStr
3
 
4
  import streamlit as st
5
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
6
 
7
  import spacy
8
  from spacy import displacy
 
9
 
10
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
  from transformers import pipeline
@@ -50,6 +59,7 @@ st.set_page_config(
50
  }
51
  )
52
 
 
53
  # Model setup
54
  @st.cache(allow_output_mutation=True,
55
  suppress_st_warning=True,
@@ -72,7 +82,7 @@ def format_explainer_html(html_string):
72
  inside_token_prefix = '##'
73
  soup = BeautifulSoup(html_string, 'html.parser')
74
  p = soup.new_tag('p',
75
- attrs={'style': 'color: black; background-color: white;'})
76
  # Select token elements and remove model specific tokens
77
  current_word = None
78
  for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
@@ -101,6 +111,7 @@ def format_explainer_html(html_string):
101
 
102
  return p
103
 
 
104
  def list_all_article_names() -> list:
105
  filenames = []
106
  for file in os.listdir('./sample-articles/'):
@@ -108,16 +119,19 @@ def list_all_article_names() -> list:
108
  filenames.append(file.replace('.txt', ''))
109
  return filenames
110
 
 
111
  def fetch_article_contents(filename: str) -> AnyStr:
112
  with open(f'./sample-articles/{filename.lower()}.txt', 'r') as f:
113
  data = f.read()
114
  return data
115
 
 
116
  def fetch_summary_contents(filename: str) -> AnyStr:
117
  with open(f'./sample-summaries/{filename.lower()}.txt', 'r') as f:
118
  data = f.read()
119
  return data
120
 
 
121
  def classify_comment(comment, selected_model):
122
  """Classify the given comment and augment with additional information."""
123
  toxicity_pipeline, cls_explainer = load_pipeline(selected_model)
@@ -180,53 +194,230 @@ if 'results' not in st.session_state:
180
  # submitted = rightmost_col.form_submit_button("Classify",
181
  # help="Classify comment")
182
 
183
- with st.form("article-input"):
184
- # TODO: should probably set a minimum length of article or something
185
- selected_article = st.selectbox('Select an article or provide your own:', list_all_article_names(),
186
- )#index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
187
- st.session_state.article_text = fetch_article_contents(selected_article)
188
- article_text = st.text_area(
189
- label='Enter the comment you want to classify below (in Dutch):',
190
- value = st.session_state.article_text)
191
- _, rightmost_col = st.columns([6,1])
192
- get_summary = rightmost_col.form_submit_button("Generate summary",
193
- help="Generate summary for the given article text")
 
 
 
 
194
 
195
 
196
  def display_summary(article_name: str):
197
- st.subheader("GENERATED SUMMARY")
198
- st.markdown("######")
199
  summary_content = fetch_summary_contents(article_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  nlp = spacy.load('en_core_web_sm')
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  doc = nlp(summary_content)
202
- html = displacy.render(doc, style="ent")
203
- html = html.replace("\n", " ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
205
- st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
206
- st.markdown(summary_content)
207
-
208
- # Listener
209
- if get_summary:
210
- if article_text:
211
- with st.spinner('Generating summary...'):
212
- #classify_comment(article_text, selected_model)
213
- display_summary(selected_article)
214
- else:
215
- st.error('**Error**: No comment to classify. Please provide a comment.')
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  # Results
218
- if 'results' in st.session_state and st.session_state.results:
219
- first = True
220
- for result in st.session_state.results[::-1]:
221
- if not first:
222
- st.markdown("---")
223
- st.markdown(f"Text:\n> {result['text']}")
224
- col_1, col_2, col_3 = st.columns([1,2,2])
225
- col_1.metric(label='', value=f"{result['emoji']}")
226
- col_2.metric(label='Label', value=f"{result['label']}")
227
- col_3.metric(label='Score', value=f"{result['score']:.3f}")
228
- st.markdown(f"Token Attribution:\n{result['tokens_with_background']}",
229
- unsafe_allow_html=True)
230
- st.caption(f"Model: {result['model_name']}")
231
- first = False
232
-
 
3
 
4
  import streamlit as st
5
  from bs4 import BeautifulSoup
6
+ import numpy as np
7
+ import base64
8
+
9
+ from spacy_streamlit.util import get_svg
10
+
11
+ from custom_renderer import render_sentence_custom
12
+ from flair.data import Sentence
13
+ from flair.models import SequenceTagger
14
 
15
  import spacy
16
  from spacy import displacy
17
+ from spacy_streamlit import visualize_parser
18
 
19
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
20
  from transformers import pipeline
 
59
  }
60
  )
61
 
62
+
63
  # Model setup
64
  @st.cache(allow_output_mutation=True,
65
  suppress_st_warning=True,
 
82
  inside_token_prefix = '##'
83
  soup = BeautifulSoup(html_string, 'html.parser')
84
  p = soup.new_tag('p',
85
+ attrs={'style': 'color: black; background-color: white;'})
86
  # Select token elements and remove model specific tokens
87
  current_word = None
88
  for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
 
111
 
112
  return p
113
 
114
+
115
  def list_all_article_names() -> list:
116
  filenames = []
117
  for file in os.listdir('./sample-articles/'):
 
119
  filenames.append(file.replace('.txt', ''))
120
  return filenames
121
 
122
+
123
def fetch_article_contents(filename: str) -> str:
    """Return the text of the sample article stored as ``./sample-articles/<filename>.txt``.

    The name is lower-cased before the file is looked up.
    """
    # Decode explicitly as UTF-8: the sample articles contain non-ASCII
    # characters (e.g. "×" and "—"), so relying on the platform's default
    # encoding can fail or garble the text.
    with open(f'./sample-articles/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
127
 
128
+
129
def fetch_summary_contents(filename: str) -> str:
    """Return the text of the sample summary stored as ``./sample-summaries/<filename>.txt``.

    The name is lower-cased before the file is looked up.
    """
    # Decode explicitly as UTF-8 instead of the platform default; the
    # summaries presumably share the articles' non-ASCII punctuation
    # (TODO confirm against the sample files).
    with open(f'./sample-summaries/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
133
 
134
+
135
  def classify_comment(comment, selected_model):
136
  """Classify the given comment and augment with additional information."""
137
  toxicity_pipeline, cls_explainer = load_pipeline(selected_model)
 
194
  # submitted = rightmost_col.form_submit_button("Classify",
195
  # help="Classify comment")
196
 
197
+
198
# TODO: should probably set a minimum length of article or something
# Let the user pick one of the bundled sample articles.
selected_article = st.selectbox(
    label='Select an article or provide your own:',
    options=list_all_article_names(),
)
# Keep the chosen article's text in the session and show it in an editable box.
st.session_state.article_text = fetch_article_contents(selected_article)
article_text = st.text_area(label='Full article text',
                            value=st.session_state.article_text,
                            height=250)
207
+
208
+
209
+ # _, rightmost_col = st.columns([5, 1])
210
+ # get_summary = rightmost_col.button("Generate summary",
211
+ # help="Generate summary for the given article text")
212
 
213
 
214
def display_summary(article_name: str):
    """Show the stored summary for ``article_name`` inside a bordered HTML box.

    The rendered HTML is also kept in ``st.session_state.summary_output``.
    """
    st.subheader("Generated summary")
    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
    parsed_summary = BeautifulSoup(fetch_summary_contents(article_name), features="html.parser")
    st.session_state.summary_output = HTML_WRAPPER.format(parsed_summary)
    st.write(st.session_state.summary_output, unsafe_allow_html=True)
222
+
223
+
224
+ # TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
225
def get_and_compare_entities_spacy(article_name: str):
    """Split the summary's spaCy entities into those found in the article and those not.

    A summary entity counts as matched when, ignoring case, it is a substring
    of at least one entity extracted from the article.

    Returns a ``(matched_entities, unmatched_entities)`` pair of string lists,
    both in the order the entities occur in the summary.
    """
    nlp = spacy.load('en_core_web_lg')

    article_doc = nlp(fetch_article_contents(article_name))
    entities_article = [str(ent) for ent in article_doc.ents]

    summary_doc = nlp(fetch_summary_contents(article_name))
    entities_summary = [str(ent) for ent in summary_doc.ents]

    # TODO: currently substring matching but probably should do embedding method or idk?
    article_lowered = [ent.lower() for ent in entities_article]
    matched_entities, unmatched_entities = [], []
    for candidate in entities_summary:
        needle = candidate.lower()
        is_matched = any(needle in haystack for haystack in article_lowered)
        bucket = matched_entities if is_matched else unmatched_entities
        bucket.append(candidate)
    return matched_entities, unmatched_entities
253
+
254
+
255
def get_and_compare_entities_flair(article_name: str):
    """Split the summary's flair entities into those found in the article and those not.

    spaCy is used only to split each text into sentences; flair's tagger
    extracts the entities sentence by sentence. A summary entity counts as
    matched when, ignoring case, it is a substring of at least one article entity.

    Returns a ``(matched_entities, unmatched_entities)`` pair of string lists,
    both in the order the entities occur in the summary.
    """
    nlp = spacy.load('en_core_web_sm')
    tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")

    def tag_entities(content):
        # Run the flair tagger on every spaCy sentence and collect the span texts.
        collected = []
        for spacy_sentence in nlp(content).sents:
            flair_sentence = Sentence(str(spacy_sentence))
            tagger.predict(flair_sentence)
            collected.extend(span.text for span in flair_sentence.get_spans('ner'))
        return collected

    entities_article = tag_entities(fetch_article_contents(article_name))
    entities_summary = tag_entities(fetch_summary_contents(article_name))

    # TODO: currently substring matching but probably should do embedding method or idk?
    article_lowered = [ent.lower() for ent in entities_article]
    matched_entities, unmatched_entities = [], []
    for candidate in entities_summary:
        needle = candidate.lower()
        is_matched = any(needle in haystack for haystack in article_lowered)
        bucket = matched_entities if is_matched else unmatched_entities
        bucket.append(candidate)
    return matched_entities, unmatched_entities
290
+
291
+
292
def highlight_entities(article_name: str):
    """Show the summary with its entities highlighted.

    Entities that also occur in the article are marked green, the others red.
    The highlighted summary is written to the page inside a bordered box.
    """
    import re  # local import: only this function needs it

    st.subheader("Match entities with article")
    summary_content = fetch_summary_contents(article_name)

    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
    markdown_end = "</mark>"

    matched_entities, unmatched_entities = get_and_compare_entities_spacy(article_name)

    # Map every distinct entity to its opening tag. The entity lists repeat an
    # entity once per occurrence; de-duplicating here keeps a repeated entity
    # from being wrapped in nested <mark> tags.
    opening_tag = {entity: markdown_start_red for entity in unmatched_entities if entity}
    opening_tag.update({entity: markdown_start_green for entity in matched_entities if entity})

    if opening_tag:
        # Replace all entities in ONE pass so that already inserted markup is
        # never searched again (an entity such as "135" would otherwise corrupt
        # the rgb(...) style). Longest entities come first so "OnePlus 10 Pro"
        # wins over its substring "OnePlus".
        alternatives = sorted(opening_tag, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(entity) for entity in alternatives))
        summary_content = pattern.sub(
            lambda found: opening_tag[found.group(0)] + found.group(0) + markdown_end,
            summary_content,
        )
    soup = BeautifulSoup(summary_content, features="html.parser")

    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""

    st.write(HTML_WRAPPER.format(soup), unsafe_allow_html=True)
312
+
313
+
314
def render_dependency_parsing(text: str):
    """Render the custom dependency SVG for ``text`` and write it to the page.

    The markup comes from ``render_sentence_custom`` and is passed through
    ``get_svg`` before being written with HTML enabled.
    """
    # The spaCy model that used to be loaded here was never used: the custom
    # renderer parses the text itself, so loading it only cost time.
    html = render_sentence_custom(text)
    # Double newlines seem to mess with the rendering
    html = html.replace("\n\n", "\n")
    st.write(get_svg(html), unsafe_allow_html=True)
334
+
335
+
336
def check_dependency(text):
    """Collect the sentences of ``text`` in which an ``amod`` dependency touches an entity.

    A sentence qualifies when one of its tokens has the dependency label
    ``amod`` and either that token's text or its head's text equals an entity
    found in the sentence by spaCy or by flair.

    Returns the qualifying sentences in document order, joined by single
    spaces; an empty string when no sentence qualifies.
    """
    tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")
    nlp = spacy.load('en_core_web_lg')
    doc = nlp(text)

    selected_sentences = []
    for sentence in doc.sents:
        # ENTITIES WITH SPACY
        all_entities = {str(entity) for entity in sentence.ents}
        # ENTITIES WITH FLAIR
        sentence_entities = Sentence(str(sentence))
        tagger.predict(sentence_entities)
        all_entities.update(entity.text for entity in sentence_entities.get_spans('ner'))

        # Iterating the sentence span visits exactly its own tokens; the old
        # id-range check also included the first token of the next sentence.
        for token in sentence:
            if token.dep_ != 'amod':
                continue
            # ONE NEEDS TO BE ENTITY
            if token.text in all_entities or token.head.text in all_entities:
                # Add each sentence once. The previous `all_deps.join(...)`
                # used the accumulated text as a *separator* between the
                # characters of the new sentence, garbling every hit after
                # the first.
                selected_sentences.append(str(sentence))
                break
    return " ".join(selected_sentences)
377
+
378
+
379
# Summary part
# NOTE(review): the page this was recovered from lost its indentation; the
# listeners are assumed to live inside their forms so the output is rendered
# in the form's container - confirm against the intended layout.
with st.form("article-input"):
    left_column, _ = st.columns([1, 1])
    get_summary = left_column.form_submit_button("Generate summary",
                                                 help="Generate summary for the given article text")
    # Listener
    if get_summary:
        if article_text:
            with st.spinner('Generating summary...'):
                display_summary(selected_article)
        else:
            # The previous message still talked about classifying a comment,
            # a leftover from the app this one was derived from.
            st.error('**Error**: No article text provided. Please select or enter an article.')

# Entity part
with st.form("Entity-part"):
    left_column, _ = st.columns([1, 1])
    draw_entities = left_column.form_submit_button("Draw Entities",
                                                   help="Draw Entities")
    if draw_entities:
        with st.spinner("Drawing entities..."):
            highlight_entities(selected_article)

# Dependency part
with st.form("Dependency-usage"):
    left_column, _ = st.columns([1, 1])
    parsing = left_column.form_submit_button("Dependency parsing",
                                             help="Dependency parsing")
    if parsing:
        with st.spinner("Doing dependency parsing..."):
            render_dependency_parsing(check_dependency(fetch_summary_contents(selected_article)))
409
  # Results
410
+ # if 'results' in st.session_state and st.session_state.results:
411
+ # first = True
412
+ # for result in st.session_state.results[::-1]:
413
+ # if not first:
414
+ # st.markdown("---")
415
+ # st.markdown(f"Text:\n> {result['text']}")
416
+ # col_1, col_2, col_3 = st.columns([1,2,2])
417
+ # col_1.metric(label='', value=f"{result['emoji']}")
418
+ # col_2.metric(label='Label', value=f"{result['label']}")
419
+ # col_3.metric(label='Score', value=f"{result['score']:.3f}")
420
+ # st.markdown(f"Token Attribution:\n{result['tokens_with_background']}",
421
+ # unsafe_allow_html=True)
422
+ # st.caption(f"Model: {result['model_name']}")
423
+ # first = False
 
arial.ttf ADDED
Binary file (312 kB). View file
 
custom_renderer.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Any, Dict, Optional

import spacy
from PIL import ImageFont
from spacy.tokens import Doc
7
+
8
def get_pil_text_size(text, font_size, font_name):
    """Measure the pixel size of ``text`` rendered in the given TrueType font.

    text (str): Text to measure.
    font_size (int): Font size handed to ``ImageFont.truetype``.
    font_name (str): Font file name or path handed to ``ImageFont.truetype``.
    RETURNS (tuple): ``(width, height)`` of the text.
    """
    font = ImageFont.truetype(font_name, font_size)
    if hasattr(font, "getsize"):
        # Older Pillow: keep the original measurement untouched.
        return font.getsize(text)
    # ``getsize`` was removed in Pillow 10. The right/bottom edges of the
    # bounding box give the extent measured from the drawing origin, which
    # should match what ``getsize`` used to report.
    _, _, right, bottom = font.getbbox(text)
    return right, bottom
12
+
13
+
14
def render_arrow(
    label: str, start: int, end: int, direction: str, i: int
) -> str:
    """Render individual arrow.

    label (str): Dependency label.
    start (int): X-coordinate of the word the arrow starts at (20 is added as offset).
    end (int): X-coordinate of the word the arrow ends at (20 is added as offset).
    direction (str): Arrow direction, 'left' or 'right'.
    i (int): Unique ID, typically arrow index.
    RETURNS (str): Rendered SVG markup.
    """
    TPL_DEP_ARCS = """
<g class="displacy-arrow">
    <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="red"/>
    <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
        <textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" side="{label_side}" fill="red" text-anchor="middle">{label}</textPath>
    </text>
    <path class="displacy-arrowhead" d="{head}" fill="red"/>
</g>
"""
    arc = get_arc(start + 20, 50, 5, end + 20)
    arrowhead = get_arrowhead(direction, start + 20, 50, end + 20)
    # NOTE(review): callers in this file pass 'left'/'right', never "rtl", so
    # the label side is always "left" in practice - confirm whether this was
    # meant to look at the text direction instead.
    label_side = "right" if direction == "rtl" else "left"
    return TPL_DEP_ARCS.format(
        id=0,
        # Use the caller's arrow index. It was hard-coded to 0, which gave
        # every arc the same element id, so each <textPath> label referenced
        # the first arc.
        i=i,
        stroke=2,
        head=arrowhead,
        label=label,
        label_side=label_side,
        arc=arc,
    )
47
+
48
+
49
def get_arc(x_start: int, y: int, y_curve: int, x_end: int) -> str:
    """Build the SVG path data for a single arc.

    x_start (int): X-coordinate of arrow start point.
    y (int): Y-coordinate of arrow start and end point.
    y_curve (int): Y-coordinate of the cubic Bézier control points.
    x_end (int): X-coordinate of arrow end point.
    RETURNS (str): Definition of the arc path ('d' attribute).
    """
    move_to = f"M{x_start},{y}"
    curve_to = f"C{x_start},{y_curve} {x_end},{y_curve} {x_end},{y}"
    return f"{move_to} {curve_to}"
60
+
61
+
62
def get_arrowhead(direction: str, x: int, y: int, end: int) -> str:
    """Build the SVG path data for a single arrow head.

    direction (str): Arrow direction; 'left' puts the head at ``x``, any
        other value puts it at ``end``.
    x (int): X-coordinate of arrow start point.
    y (int): Y-coordinate of arrow start and end point.
    end (int): X-coordinate of arrow end point.
    RETURNS (str): Definition of the arrow head path ('d' attribute).
    """
    arrow_width = 6
    pointing_left = direction == "left"
    tip = x if pointing_left else end
    if pointing_left:
        first_wing, second_wing = tip - arrow_width + 2, tip + arrow_width - 2
    else:
        first_wing, second_wing = tip + arrow_width - 2, tip - arrow_width + 2
    wing_y = y - arrow_width
    return "M{},{} L{},{} {},{}".format(tip, y + 2, first_wing, wing_y, second_wing, wing_y)
77
+
78
+
79
+ # parsed = [{'words': [{'text': 'The', 'tag': 'DET', 'lemma': None}, {'text': 'OnePlus', 'tag': 'PROPN', 'lemma': None}, {'text': '10', 'tag': 'NUM', 'lemma': None}, {'text': 'Pro', 'tag': 'PROPN', 'lemma': None}, {'text': 'is', 'tag': 'AUX', 'lemma': None}, {'text': 'the', 'tag': 'DET', 'lemma': None}, {'text': 'company', 'tag': 'NOUN', 'lemma': None}, {'text': "'s", 'tag': 'PART', 'lemma': None}, {'text': 'first', 'tag': 'ADJ', 'lemma': None}, {'text': 'flagship', 'tag': 'NOUN', 'lemma': None}, {'text': 'phone.', 'tag': 'NOUN', 'lemma': None}], 'arcs': [{'start': 0, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'nmod', 'dir': 'left'}, {'start': 1, 'end': 2, 'label': 'nummod', 'dir': 'right'}, {'start': 3, 'end': 4, 'label': 'nsubj', 'dir': 'left'}, {'start': 5, 'end': 6, 'label': 'det', 'dir': 'left'}, {'start': 6, 'end': 10, 'label': 'poss', 'dir': 'left'}, {'start': 6, 'end': 7, 'label': 'case', 'dir': 'right'}, {'start': 8, 'end': 10, 'label': 'amod', 'dir': 'left'}, {'start': 9, 'end': 10, 'label': 'compound', 'dir': 'left'}, {'start': 4, 'end': 10, 'label': 'attr', 'dir': 'right'}], 'settings': {'lang': 'en', 'direction': 'ltr'}}]
80
def render_sentence_custom(parsed: str):
    """Render text as one line of words with an arrow over every 'amod' arc.

    The text is parsed with spaCy's 'en_core_web_sm' model; words are laid out
    left to right using their measured pixel width, and words spanned by an
    'amod' arc get extra spacing so the arrow stays readable. Text without any
    'amod' arc is rendered as plain words.

    parsed (str): Raw text to parse and render.
    RETURNS (str): Rendered SVG markup.
    """
    from html import escape  # local import: makes token text safe inside the SVG

    TPL_DEP_WORDS = """
<text class="displacy-token" fill="currentColor" text-anchor="start" y="{y}">
    <tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="{x}">{tag}</tspan>
</text>
"""

    TPL_DEP_SVG = """
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="{lang}" id="{id}" class="displacy" width="{width}" height="{height}" direction="{dir}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}; direction: {dir}">{content}</svg>
"""
    nlp = spacy.load('en_core_web_sm')
    deps = parse_deps(nlp(parsed))
    words = deps["words"]
    # Keep each arc's position in the full arc list as its unique arrow id.
    amod_arcs = [(i, arc) for i, arc in enumerate(deps["arcs"]) if arc["label"] == "amod"]

    # Word indices spanned by at least one amod arc. Previously only the LAST
    # amod pair was remembered (and indexing it raised IndexError when the
    # text had words but no amod arc).
    spanned = set()
    for _, arc in amod_arcs:
        spanned.update(range(arc["start"], arc["end"] + 1))

    x_cursor = 10
    word_x = []  # x-coordinate of every word, by word index
    svg_words = []
    for index, word in enumerate(words):
        text = word["text"] + " "
        word_x.append(x_cursor)
        # Escape for markup, but measure the unescaped text.
        svg_words.append(TPL_DEP_WORDS.format(text=escape(text), tag="", x=x_cursor, y=70))
        if index in spanned:
            x_cursor += 50
        x_cursor += get_pil_text_size(text, 16, 'arial.ttf')[0] + 4

    # Every amod arrow is drawn between its own two words; before, all arrows
    # were drawn on top of each other over the last amod pair.
    arcs_svg = [
        render_arrow(arc["label"], word_x[arc["start"]], word_x[arc["end"]], arc["dir"], i)
        for i, arc in amod_arcs
    ]

    content = "".join(svg_words) + "".join(arcs_svg)

    return TPL_DEP_SVG.format(
        id=0,
        # Grow beyond the default width so long text is not cut off.
        width=max(1975, x_cursor),
        height=574.5,
        color="#000000",  # was "#00000", which is not a valid hex colour
        bg="#ffffff",
        font="Arial",
        content=content,
        dir="ltr",
        lang="en",
    )
141
+
142
def parse_deps(orig_doc: Doc, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Generate dependency parse in {'words': [], 'arcs': []} format.

    orig_doc (Doc): Document to parse. It is copied first, so the merging done
        here does not alter the caller's Doc.
    options (dict): Optional settings. Keys read here: 'collapse_phrases'
        (default False), 'collapse_punct' (default True), 'fine_grained' and
        'add_lemma'.
    RETURNS (dict): Generated dependency parse keyed by words, arcs and settings.
    """
    import warnings  # local import: only needed for the missing-parse warning

    # A fresh dict per call instead of a shared mutable default argument.
    options = {} if options is None else options
    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
    if not doc.has_annotation("DEP"):
        warnings.warn(
            "Doc has no dependency annotation; no meaningful dependency "
            "arcs can be generated for it."
        )
    if options.get("collapse_phrases", False):
        # Merge every noun chunk into one token carrying its root's attributes.
        with doc.retokenize() as retokenizer:
            for chunk in list(doc.noun_chunks):
                attrs = {
                    "tag": chunk.root.tag_,
                    "lemma": chunk.root.lemma_,
                    "ent_type": chunk.root.ent_type_,
                }
                retokenizer.merge(chunk, attrs=attrs)
    if options.get("collapse_punct", True):
        # Attach runs of punctuation to the word that precedes them.
        spans = []
        for word in doc[:-1]:
            if word.is_punct or not word.nbor(1).is_punct:
                continue
            start = word.i
            end = word.i + 1
            while end < len(doc) and doc[end].is_punct:
                end += 1
            span = doc[start:end]
            spans.append((span, word.tag_, word.lemma_, word.ent_type_))
        with doc.retokenize() as retokenizer:
            for span, tag, lemma, ent_type in spans:
                attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
                retokenizer.merge(span, attrs=attrs)
    fine_grained = options.get("fine_grained")
    add_lemma = options.get("add_lemma")
    words = [
        {
            "text": w.text,
            "tag": w.tag_ if fine_grained else w.pos_,
            "lemma": w.lemma_ if add_lemma else None,
        }
        for w in doc
    ]
    arcs = []
    for word in doc:
        # "start" is always the smaller token index; "dir" records on which
        # side the dependent sits. A token that is its own head gets no arc.
        if word.i < word.head.i:
            arcs.append(
                {"start": word.i, "end": word.head.i, "label": word.dep_, "dir": "left"}
            )
        elif word.i > word.head.i:
            arcs.append(
                {
                    "start": word.head.i,
                    "end": word.i,
                    "label": word.dep_,
                    "dir": "right",
                }
            )
    return {"words": words, "arcs": arcs, "settings": get_doc_settings(orig_doc)}
201
+
202
def get_doc_settings(doc: Doc) -> Dict[str, Any]:
    """Collect the rendering settings of a document.

    doc (Doc): Document to read the settings from.
    RETURNS (dict): The document's language under 'lang' and its vocab's
        writing direction under 'direction' ('ltr' when none is set).
    """
    language = doc.lang_
    text_direction = doc.vocab.writing_system.get("direction", "ltr")
    return dict(lang=language, direction=text_direction)
requirements.txt CHANGED
@@ -3,4 +3,7 @@ streamlit==1.2.0
3
  transformers==4.15.0
4
  transformers-interpret==0.5.2
5
  spacy==3.0.0
 
 
6
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
 
 
3
  transformers==4.15.0
4
  transformers-interpret==0.5.2
5
  spacy==3.0.0
6
+ spacy_streamlit==1.0.3
7
+ flair
8
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
9
+ en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0.tar.gz
sample-articles/article13.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ We're already seeing the effects of the Oppo merger on OnePlus' next flagship.
2
+ Ron Amadeo - Jan 5, 2022 1:00 am UTC
3
+
4
+ Enlarge / The OnePlus 10 Pro.
5
+
6
+ OnePlus
7
+
8
+ Official product news about the upcoming OnePlus 10 Pro has begun to trickle out. For now, we have an incomplete overview with some pictures and specs, while things like a price, release date, and the finer details will have to wait for later.
9
+
10
+ First up: specs. OnePlus 10 Pro officially has the brand-new Qualcomm Snapdragon 8 Gen 1 SoC. This is Qualcomm's new flagship SoC for 2022, and it features a single ARM Cortex X2 core, three medium Cortex A710 CPUs, and four small Cortex A510 CPUs, all built on a 4 nm process. OnePlus isn't saying how much RAM and storage the 10 Pro has, but the 9 Pro came with 8GB or 12GB of RAM and 128GB or 256GB of storage. The company confirmed the display is 120 Hz but didn't give a size, though rumors say it's 6.7-inch, the same as the OnePlus 9 Pro. That fits the now-official dimensions, which are 163 × 73.9 × 8.55 mm.
11
+
12
+ The battery is officially 5000 mAh, an upgrade over the 9 Pro's 4500 mAh battery. Considering the similar dimensions between the two phones, this is a welcome upgrade in battery density. OnePlus is also up to a whopping 80 W "SuperVOOC" quick charging now—an improvement over last year's 65 W "Warp Charge." OnePlus doesn't give any indication of what kind of charge time we can expect, but 65 W could charge the 9 Pro's 4500 mAh battery from 0-100 in a half-hour. Charging speed is still outpacing battery growth, so the 10 Pro should charge in under a half-hour. Just like last year, wireless charging is 50 W.
13
+
14
+ Enlarge / Another look at that wacky camera block.
15
+
16
+ OnePlus
17
+
18
+ OnePlus has pitched itself as a scrappy startup in the past, but it's actually owned by the Chinese company BBK Electronics, one of the world's largest smartphone manufacturers. Just like General Motors, BBK has multiple brands (OnePlus, Oppo, Vivo, Realme, and iQOO) targeting different markets, and they share plenty of parts and engineering. While OnePlus and Oppo have always shared some engineering resources, last year it was announced OnePlus would actually be folded into Oppo.
19
+
20
+ The Oppoization of OnePlus is going to be a major narrative for the OnePlus 10 Pro. We can already see a bit of it with the change from "Warp Charging" (OnePlus branding) to "SuperVOOC" (Oppo branding). But what really matters is the software, which will see OnePlus adopt Oppo's Color OS Android skin with a few custom tweaks rather than the separate codebases the two companies were running. We got a glimpse of this design direction via the OnePlus 9's Android 12 update, and the reviews were not kind. But we'll see what the first new phone software brings.
21
+
22
+ As for the design, the camera block is really the only area where Android OEMs allow themselves to differentiate from the norm. This year, OnePlus is going with this square-ish design that wraps around the side of the phone. It looks a lot like the Galaxy S21 Ultra camera block, except that it's wrapped around the entire corner. Inside the camera block are three cameras and an LED flash. Right now, OnePlus is only disclosing megapixel counts, and those are 48MP, 50MP, and 8MP.
23
+
24
+ Enlarge / This is not an official picture, but OnLeaks' clearly accurate leak from November is still our only look at the front of the phone.
25
+
26
+ We don't actually have a picture of the front yet, so above is OnLeak's unofficial render from a few months ago. This has the camera hole on the left side instead of the middle. Other than that, it looks like every other Android phone on the market.
27
+
28
+ It might be because of Oppo's influence, but OnePlus' launch is all sorts of weird this year. The phone is launching in China first on January 11. We don't have a price yet, but OnePlus' flagship prices have gone up every year so far, and the 9 Pro was $969. There's also no word on a US release date yet.
sample-articles/article16.txt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SINGAPORE, Jan 5 (Reuters) - Chinese gaming and social media company Tencent Holdings Ltd (0700.HK) has raised $3 billion by selling 14.5 million shares at $208 each in Sea, which owns e-commerce firm Shopee, according to a term sheet seen by Reuters on Wednesday.
2
+
3
+ Tencent said late on Tuesday it had entered into a deal to reduce its stake in the Singapore-based gaming and e-commerce group to 18.7% from 21.3%. The company plans to retain the substantial majority of its stake in Sea for the long term.
4
+
5
+ The sale comes after Tencent said last month it would divest $16.4 billion of its stake in JD.com (9618.HK), weakening its ties to China's second-biggest e-commerce firm, amid pressure from Beijing's broad regulatory crackdown on technology firms.
6
+
7
+ Register now for FREE unlimited access to Reuters.com
8
+ Sea's shares fell 11.4% on Tuesday in New York to $197.8 following the divestment news. Ahead of the announcement, Sea said Tencent had also agreed to cut its voting stake in the company to less than 10%.
9
+
10
+ "We believe with a lower voting right control, it could reduce any potential conflict if Tencent's gaming teams plan to publish more games directly in global markets and help reduce any potential geopolitical friction if/when Sea plans to expand more strategically into new markets in more countries," Citi's analysts said in a report on Wednesday.
11
+
12
+ Sea said Tencent and its affiliates had given an irrevocable notice to convert all their Class B ordinary shares.
13
+
14
+ Upon conversion, all outstanding Class B shares of Sea will be beneficially owned by Forrest Li, the founder, chairman and CEO of Sea, Southeast Asia's most valued company, which has a market capitalisation of $110 billion.
15
+
16
+ Tencent and Sea declined to comment on the pricing of the share sale.
17
+
18
+ Guotai Junan International analyst Vincent Liu said he did not see Tencent's move to trim its Sea stake as surprising, given its recent JD.com divestment. Tencent owns a huge, diversified investment portfolio so buying or selling shares in its investees could be considered a "regular action", he said.
19
+
20
+ "On the other hand, we think that this reflects some of Tencent's adjustments in business strategy, especially under the circumstance of tightening regulations on anti-trust," he added.
21
+
22
+ Sea's shares have shed 47% from a record high of $372 struck in October but have still risen five-fold in the past three years.
23
+
24
+ The company started out as a gaming firm in 2009 and then diversified into e-commerce and food delivery, benefiting from roaring demand for its services from consumers, especially during pandemic-related restrictions.
25
+
26
+ Sea is now expanding its e-commerce operations globally.
27
+
28
+ "The divestment provides Tencent with resources to fund other investments and social initiatives," Tencent said in a statement.
29
+
30
+ It sold the stock at the lower end of the $208-$212 per share range when the transaction was launched on Tuesday. The price set was a 6.8% discount to Sea's closing price of $223.3 on Monday.
31
+
32
+ Tencent's shares fell 3.5% on Wednesday in a broader market, weighed down by tech stocks.
33
+
34
+ Tencent will be subject to a lockup period that restricts further sale of Sea shares by Tencent during the next six months.
35
+
36
+ Separately, Sea is proposing to increase the voting power of each Class B ordinary share to 15 votes from three.
37
+
38
+ "The board believes that, as Sea has scaled significantly to become a leading global consumer internet company, it is in the best interests of the company in pursuing its long-term growth strategies to further clarify its capital structure through the contemplated changes," it said.
39
+
40
+ Sea said the changes are subject to approval by its shareholders.
41
+
42
+ It said that once the changes are made, the outstanding Class B ordinary shares beneficially owned by Li are expected to represent about 57% of the voting power, up from about 52%.
43
+
44
+ Separately, Li holds about 54% of the total voting power related to the size and composition of Sea's board of directors.
sample-summaries/article13.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ The OnePlus 10 Pro is the company's first flagship phone. It's the result of a merger between OnePlus and Oppo, which will be called "SuperVOOC". The phone is launching in China first on January 11. There's also no word on a US release date yet. The 10 Pro will have a 6.7-inch display and three cameras on the back. We don't have a price yet, but OnePlus' flagship prices have gone up every year so far, and the 9 Pro was $969. The phone will go on sale January 11 in China and January 18 in the U.S.
sample-summaries/article16.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Tencent Holdings Ltd has raised $3 billion by selling 14.5 million shares in Sea. Sea owns e-commerce firm Shopee, according to a term sheet seen by Reuters on Wednesday. Tencent said late on Tuesday it had entered into a deal to reduce its stake in the Singapore-based group to 18.7% from 21.3%. The sale comes after Tencent said last month it would divest $16.4 billion of its stakes in JD.com and Six9, weakening its ties to China's second-biggest e-commerce firm. Sea's shares fell 11.4% on Tuesday in New York to $197.8 following the divestment news.