Victoria Slocum committed on
Commit
a997532
·
1 Parent(s): 804e6f2

Feat: Add noun chunks

Browse files
Files changed (1) hide show
  1. app.py +70 -11
app.py CHANGED
@@ -20,6 +20,7 @@ texts = {"en": DEFAULT_TEXT, "ca": "Apple està buscant comprar una startup del
20
  "pl": "Poczuł przyjemną woń mocnej kawy.", "pt": "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", "ro": "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari", "ru": "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд", "sv": "Apple överväger att köpa brittisk startup för 1 miljard dollar.", "zh": "作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。"}
21
 
22
  button_css = "float: right; --tw-border-opacity: 1; border-color: rgb(229 231 235 / var(--tw-border-opacity)); --tw-gradient-from: rgb(243 244 246 / 0.7); --tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to, rgb(243 244 246 / 0)); --tw-gradient-to: rgb(229 231 235 / 0.8); --tw-text-opacity: 1; color: rgb(55 65 81 / var(--tw-text-opacity)); border-width: 1px; --tw-bg-opacity: 1; background-color: rgb(255 255 255 / var(--tw-bg-opacity)); background-image: linear-gradient(to bottom right, var(--tw-gradient-stops)); display: inline-flex; flex: 1 1 0%; align-items: center; justify-content: center; --tw-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05); --tw-shadow-colored: 0 1px 2px 0 var(--tw-shadow-color); box-shadow: var(--tw-ring-offset-shadow, 0 0 #0000), var(--tw-ring-shadow, 0 0 #0000), var(--tw-shadow); -webkit-appearance: button; border-radius: 0.5rem; padding-top: 0.5rem; padding-bottom: 0.5rem; padding-left: 1rem; padding-right: 1rem; font-size: 1rem; line-height: 1.5rem; font-weight: 600;"
 
23
 
24
  def get_all_models():
25
  with open("requirements.txt") as f:
@@ -32,14 +33,17 @@ def get_all_models():
32
  models.append(model)
33
  return models
34
 
 
35
  models = get_all_models()
36
 
 
37
  def download_svg(svg):
38
  encode = base64.b64encode(bytes(svg, 'utf-8'))
39
  img = 'data:image/svg+xml;base64,' + str(encode)[2:-1]
40
  html = f'<a download="displacy.svg" href="{img}" style="{button_css}">Download as SVG</a>'
41
  return html
42
 
 
43
  def dependency(text, col_punct, col_phrase, compact, bg, font, model):
44
  model_name = model + "_sm"
45
  nlp = spacy.load(model_name)
@@ -53,7 +57,7 @@ def dependency(text, col_punct, col_phrase, compact, bg, font, model):
53
 
54
  def entity(text, ents, model):
55
  model_name = model + "_sm"
56
- nlp = spacy.load(model_name)
57
  doc = nlp(text)
58
  options = {"ents": ents}
59
  svg = displacy.render(doc, style="ent", options=options)
@@ -87,6 +91,29 @@ def default_token(text, attributes, model):
87
  return data, model_name
88
 
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def random_vectors(text, model):
91
  model_name = model + "_md"
92
  nlp = spacy.load(model_name)
@@ -203,7 +230,8 @@ with demo:
203
  with gr.Column():
204
  gr.Markdown(" ")
205
  with gr.Column():
206
- dep_model = gr.Textbox(label="Model", value="en_core_web_sm")
 
207
  with gr.Row():
208
  with gr.Column():
209
  col_punct = gr.Checkbox(
@@ -217,14 +245,16 @@ with demo:
217
  with gr.Column():
218
  text = gr.Textbox(
219
  label="Text Color", value="black")
220
-
221
  dep_output = gr.HTML(value=dependency(
222
  DEFAULT_TEXT, True, True, False, DEFAULT_COLOR, "black", DEFAULT_MODEL)[0])
223
  with gr.Row():
224
  with gr.Column():
225
- dep_button = gr.Button("Update Dependency Parser", variant="primary")
 
226
  with gr.Column():
227
- dep_download_button = gr.HTML(value=download_svg(dep_output.value))
 
228
  gr.Markdown(" ")
229
  with gr.Box():
230
  with gr.Column():
@@ -239,12 +269,14 @@ with demo:
239
  with gr.Column():
240
  gr.Markdown(" ")
241
  with gr.Column():
242
- ent_model = gr.Textbox(label="Model", value="en_core_web_sm")
 
243
  ent_input = gr.CheckboxGroup(
244
  DEFAULT_ENTS, value=DEFAULT_ENTS)
245
  ent_output = gr.HTML(value=entity(
246
  DEFAULT_TEXT, DEFAULT_ENTS, DEFAULT_MODEL)[0])
247
- ent_button = gr.Button("Update Entity Recognizer", variant="primary")
 
248
  with gr.Box():
249
  with gr.Column():
250
  with gr.Row():
@@ -258,7 +290,8 @@ with demo:
258
  with gr.Column():
259
  gr.Markdown(" ")
260
  with gr.Column():
261
- tok_model = gr.Textbox(label="Model", value="en_core_web_sm")
 
262
  with gr.Row():
263
  with gr.Column():
264
  tok_input = gr.CheckboxGroup(
@@ -267,7 +300,27 @@ with demo:
267
  gr.Markdown("")
268
  tok_output = gr.Dataframe(headers=DEFAULT_TOK_ATTR, value=default_token(
269
  DEFAULT_TEXT, DEFAULT_TOK_ATTR, DEFAULT_MODEL)[0], overflow_row_behaviour="paginate")
270
- tok_button = gr.Button("Update Token Properties", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  with gr.Box():
272
  with gr.Column():
273
  with gr.Row():
@@ -281,7 +334,8 @@ with demo:
281
  with gr.Column():
282
  gr.Markdown(" ")
283
  with gr.Column():
284
- sim_model = gr.Textbox(label="Model", value="en_core_web_md")
 
285
  with gr.Row():
286
  with gr.Column():
287
  sim_text1 = gr.Textbox(
@@ -309,7 +363,8 @@ with demo:
309
  with gr.Column():
310
  gr.Markdown(" ")
311
  with gr.Column():
312
- span_model = gr.Textbox(label="Model", value="en_core_web_sm")
 
313
  with gr.Row():
314
  with gr.Column():
315
  span1 = gr.Textbox(
@@ -341,6 +396,8 @@ with demo:
341
  text_input, col_punct, col_phrase, compact, bg, text, model_input], outputs=[dep_output, dep_download_button, dep_model])
342
  button.click(
343
  entity, inputs=[text_input, ent_input, model_input], outputs=[ent_output, ent_model])
 
 
344
  button.click(
345
  token, inputs=[text_input, tok_input, model_input], outputs=[tok_output, tok_model])
346
  button.click(vectors, inputs=[sim_text1,
@@ -353,6 +410,8 @@ with demo:
353
  entity, inputs=[text_input, ent_input, model_input], outputs=[ent_output, ent_model])
354
  tok_button.click(
355
  token, inputs=[text_input, tok_input, model_input], outputs=[tok_output, tok_model])
 
 
356
  sim_button.click(vectors, inputs=[
357
  sim_text1, sim_text2, model_input], outputs=[sim_output, sim_model])
358
  span_button.click(
 
20
  "pl": "Poczuł przyjemną woń mocnej kawy.", "pt": "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", "ro": "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari", "ru": "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд", "sv": "Apple överväger att köpa brittisk startup för 1 miljard dollar.", "zh": "作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。"}
21
 
22
  button_css = "float: right; --tw-border-opacity: 1; border-color: rgb(229 231 235 / var(--tw-border-opacity)); --tw-gradient-from: rgb(243 244 246 / 0.7); --tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to, rgb(243 244 246 / 0)); --tw-gradient-to: rgb(229 231 235 / 0.8); --tw-text-opacity: 1; color: rgb(55 65 81 / var(--tw-text-opacity)); border-width: 1px; --tw-bg-opacity: 1; background-color: rgb(255 255 255 / var(--tw-bg-opacity)); background-image: linear-gradient(to bottom right, var(--tw-gradient-stops)); display: inline-flex; flex: 1 1 0%; align-items: center; justify-content: center; --tw-shadow: 0 1px 2px 0 rgb(0 0 0 / 0.05); --tw-shadow-colored: 0 1px 2px 0 var(--tw-shadow-color); box-shadow: var(--tw-ring-offset-shadow, 0 0 #0000), var(--tw-ring-shadow, 0 0 #0000), var(--tw-shadow); -webkit-appearance: button; border-radius: 0.5rem; padding-top: 0.5rem; padding-bottom: 0.5rem; padding-left: 1rem; padding-right: 1rem; font-size: 1rem; line-height: 1.5rem; font-weight: 600;"
23
+ NOUN_ATTR = ['text', 'root.text', 'root.dep_', 'root.head.text']
24
 
25
  def get_all_models():
26
  with open("requirements.txt") as f:
 
33
  models.append(model)
34
  return models
35
 
36
+
37
  models = get_all_models()
38
 
39
+
40
  def download_svg(svg):
41
  encode = base64.b64encode(bytes(svg, 'utf-8'))
42
  img = 'data:image/svg+xml;base64,' + str(encode)[2:-1]
43
  html = f'<a download="displacy.svg" href="{img}" style="{button_css}">Download as SVG</a>'
44
  return html
45
 
46
+
47
  def dependency(text, col_punct, col_phrase, compact, bg, font, model):
48
  model_name = model + "_sm"
49
  nlp = spacy.load(model_name)
 
57
 
58
  def entity(text, ents, model):
59
  model_name = model + "_sm"
60
+ nlp = spacy.load(model_name)
61
  doc = nlp(text)
62
  options = {"ents": ents}
63
  svg = displacy.render(doc, style="ent", options=options)
 
91
  return data, model_name
92
 
93
 
94
+ def noun_chunks(text, model):
95
+ model_name = model + "_sm"
96
+ nlp = spacy.load(model_name)
97
+ data = []
98
+ doc = nlp(text)
99
+ for chunk in doc.noun_chunks:
100
+ data.append([chunk.text, chunk.root.text, chunk.root.dep_,
101
+ chunk.root.head.text])
102
+ data = pd.DataFrame(data, columns=NOUN_ATTR)
103
+ return data, model_name
104
+
105
+
106
+ def default_noun_chunks(text, model):
107
+ model_name = model + "_sm"
108
+ nlp = spacy.load(model_name)
109
+ data = []
110
+ doc = nlp(text)
111
+ for chunk in doc.noun_chunks:
112
+ data.append([chunk.text, chunk.root.text, chunk.root.dep_,
113
+ chunk.root.head.text])
114
+ return data, model_name
115
+
116
+
117
  def random_vectors(text, model):
118
  model_name = model + "_md"
119
  nlp = spacy.load(model_name)
 
230
  with gr.Column():
231
  gr.Markdown(" ")
232
  with gr.Column():
233
+ dep_model = gr.Textbox(
234
+ label="Model", value="en_core_web_sm")
235
  with gr.Row():
236
  with gr.Column():
237
  col_punct = gr.Checkbox(
 
245
  with gr.Column():
246
  text = gr.Textbox(
247
  label="Text Color", value="black")
248
+
249
  dep_output = gr.HTML(value=dependency(
250
  DEFAULT_TEXT, True, True, False, DEFAULT_COLOR, "black", DEFAULT_MODEL)[0])
251
  with gr.Row():
252
  with gr.Column():
253
+ dep_button = gr.Button(
254
+ "Update Dependency Parser", variant="primary")
255
  with gr.Column():
256
+ dep_download_button = gr.HTML(
257
+ value=download_svg(dep_output.value))
258
  gr.Markdown(" ")
259
  with gr.Box():
260
  with gr.Column():
 
269
  with gr.Column():
270
  gr.Markdown(" ")
271
  with gr.Column():
272
+ ent_model = gr.Textbox(
273
+ label="Model", value="en_core_web_sm")
274
  ent_input = gr.CheckboxGroup(
275
  DEFAULT_ENTS, value=DEFAULT_ENTS)
276
  ent_output = gr.HTML(value=entity(
277
  DEFAULT_TEXT, DEFAULT_ENTS, DEFAULT_MODEL)[0])
278
+ ent_button = gr.Button(
279
+ "Update Entity Recognizer", variant="primary")
280
  with gr.Box():
281
  with gr.Column():
282
  with gr.Row():
 
290
  with gr.Column():
291
  gr.Markdown(" ")
292
  with gr.Column():
293
+ tok_model = gr.Textbox(
294
+ label="Model", value="en_core_web_sm")
295
  with gr.Row():
296
  with gr.Column():
297
  tok_input = gr.CheckboxGroup(
 
300
  gr.Markdown("")
301
  tok_output = gr.Dataframe(headers=DEFAULT_TOK_ATTR, value=default_token(
302
  DEFAULT_TEXT, DEFAULT_TOK_ATTR, DEFAULT_MODEL)[0], overflow_row_behaviour="paginate")
303
+ tok_button = gr.Button(
304
+ "Update Token Properties", variant="primary")
305
+ with gr.Box():
306
+ with gr.Column():
307
+ with gr.Row():
308
+ with gr.Column():
309
+ gr.Markdown(
310
+ "## [🔗 Noun chunks](https://spacy.io/usage/linguistic-feature#noun-chunks)")
311
+ gr.Markdown(
312
+ "You can use `doc.noun_chunks` to extract noun phrases from a doc object")
313
+ with gr.Column():
314
+ with gr.Row():
315
+ with gr.Column():
316
+ gr.Markdown(" ")
317
+ with gr.Column():
318
+ noun_model = gr.Textbox(
319
+ label="Model", value="en_core_web_sm")
320
+ noun_output = gr.Dataframe(headers=NOUN_ATTR, value=default_noun_chunks(
321
+ DEFAULT_TEXT, DEFAULT_MODEL)[0], overflow_row_behaviour="paginate")
322
+ noun_button = gr.Button(
323
+ "Update Noun Chunks", variant="primary")
324
  with gr.Box():
325
  with gr.Column():
326
  with gr.Row():
 
334
  with gr.Column():
335
  gr.Markdown(" ")
336
  with gr.Column():
337
+ sim_model = gr.Textbox(
338
+ label="Model", value="en_core_web_md")
339
  with gr.Row():
340
  with gr.Column():
341
  sim_text1 = gr.Textbox(
 
363
  with gr.Column():
364
  gr.Markdown(" ")
365
  with gr.Column():
366
+ span_model = gr.Textbox(
367
+ label="Model", value="en_core_web_sm")
368
  with gr.Row():
369
  with gr.Column():
370
  span1 = gr.Textbox(
 
396
  text_input, col_punct, col_phrase, compact, bg, text, model_input], outputs=[dep_output, dep_download_button, dep_model])
397
  button.click(
398
  entity, inputs=[text_input, ent_input, model_input], outputs=[ent_output, ent_model])
399
+ button.click(
400
+ noun_chunks, inputs=[text_input, model_input], outputs=[noun_output, noun_model])
401
  button.click(
402
  token, inputs=[text_input, tok_input, model_input], outputs=[tok_output, tok_model])
403
  button.click(vectors, inputs=[sim_text1,
 
410
  entity, inputs=[text_input, ent_input, model_input], outputs=[ent_output, ent_model])
411
  tok_button.click(
412
  token, inputs=[text_input, tok_input, model_input], outputs=[tok_output, tok_model])
413
+ noun_button.click(
414
+ noun_chunks, inputs=[text_input, model_input], outputs=[noun_output, noun_model])
415
  sim_button.click(vectors, inputs=[
416
  sim_text1, sim_text2, model_input], outputs=[sim_output, sim_model])
417
  span_button.click(