Hellisotherpeople committed on
Commit
0723a2c
·
1 Parent(s): 04d9da1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -54
app.py CHANGED
@@ -6,55 +6,63 @@ from txtai.graph import GraphFactory
6
  from datasets import load_dataset
7
  import streamlit as st
8
  import streamlit.components.v1 as components
9
-
10
 
11
 
12
  st.set_page_config(page_title="DebateKG")
13
  st.title("DebateKG - Automatic Policy Debate Case Creation")
14
- st.write("WIP, give me a few more days before reviewing!")
15
  st.caption("github: https://github.com/Hellisotherpeople/DebateKG")
16
 
17
 
18
  form = st.sidebar.form("Main Settings")
19
  form.header("Main Settings")
20
- number_of_paths = form.number_input("Enter the cutoff number of paths for all shortest path search", value = 4)
21
- highlight_threshold = form.number_input("Enter the minimum similarity value needed to highlight" , value = 4)
22
- show_extract = form.checkbox("Show extracts", value = False)
23
  show_abstract = form.checkbox("Show abstract", value = False)
24
  show_full_doc = form.checkbox("Show full doc", value = False)
25
- show_citation = form.checkbox("Show citation", value = False)
26
- rerank_word = form.text_area("Enter the word", value = "Full-Document")
27
- rerank_topic = form.text_area("Enter the topic", value = "Full-Document")
28
-
29
- form.form_submit_button("Submit")
30
-
31
-
32
-
33
- dataset = load_dataset("Hellisotherpeople/DebateSum", split = "train")
34
- seg = pysbd.Segmenter(language="en", clean=False)
35
-
36
-
37
- embeddings = Embeddings({
38
- "path": "sentence-transformers/all-mpnet-base-v2",
39
- "content": True,
40
- "functions": [
41
- {"name": "graph", "function": "graph.attribute"},
42
- ],
43
- "expressions": [
44
- {"name": "topic", "expression": "graph(indexid, 'topic')"},
45
- {"name": "topicrank", "expression": "graph(indexid, 'topicrank')"}
46
- ],
47
- "graph": {
48
- "limit": 100,
49
- "minscore": 0.10,
50
- "topics": {
51
- "terms": 4,
52
- "resolution" : 100
53
- }
54
- }
55
- })
56
-
57
- embeddings.load("DebateSum_SemanticGraph_mpnet_extract.tar.gz")
 
 
 
 
 
 
 
 
 
 
58
  graph = embeddings.graph
59
 
60
  def david_distance(source, target, attrs):
@@ -64,13 +72,13 @@ def david_distance(source, target, attrs):
64
  def david_showpath(source, target, the_graph):
65
  return nx.shortest_path(the_graph, source, target, david_distance)
66
 
 
 
67
 
68
 
69
- import string
70
-
71
  def highlight(index, result):
72
  output = f"{index}. "
73
- spans = [(token, score, "#fff59d" if score > 0.01 else None) for token, score in result["tokens"]]
74
 
75
  for token, _, color in spans:
76
  output += f"<span style='background-color: {color}'>{token}</span> " if color else f"{token} "
@@ -91,27 +99,37 @@ def showpath_any(list_of_arguments, strip_punctuation = True, the_graph=graph.ba
91
  if strip_punctuation:
92
  text = text.translate(str.maketrans("","", string.punctuation))
93
  list_of_evidence_ids.append(int(embeddings.search(f"select id from txtai where similar('{text}') limit 1")[0]['id']))
94
- print(list_of_evidence_ids)
95
 
96
  sections = []
 
97
  for x, p in enumerate(path):
98
  if x == 0:
99
  # Print start node
100
 
101
  sections.append(f"{x + 1}. {p}")
102
- #sections.append(dataset["Abstract"][list_of_evidence_ids[x]])
103
- #sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
104
- #sections.append(dataset["Full-Document"][list_of_evidence_ids[x]])
 
 
 
 
 
105
 
106
  if x < len(path) - 1:
107
  # Explain and highlight next path element
108
  results = embeddings.explain(p, [path[x + 1]], limit=1)[0]
109
  sections.append(highlight(x + 2, results))
110
- #sections.append(dataset["Abstract"][list_of_evidence_ids[x+1]])
111
- #sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
112
- #sections.append(dataset["Full-Document"][list_of_evidence_ids[x+1]])
 
 
 
 
 
113
 
114
- return components.html("<br/><br/>".join(sections), scrolling = True, width = 800, height = 1000)
115
 
116
  def question(text, rerank_word = "", rerank_topic = "", limit = 100):
117
  return embeddings.search(f"select id, text, topic, evidence_id, score from txtai where similar('{text}') and text like '%{rerank_word}%' and topic like '%{rerank_topic}%' limit {limit}")
@@ -119,11 +137,36 @@ def question(text, rerank_word = "", rerank_topic = "", limit = 100):
119
 
120
 
121
  query_form = st.form("Query the Index:")
122
- query_form.write("Write a SQL query")
123
- query_form_submitted = query_form.form_submit_button("Click me to get ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
 
126
- #showpath_any([3, 12, 15])
127
 
128
- with st.expander("mine", expanded = False):
129
- st.write(embeddings.search(f"select * from txtai where similar('you') and text like '%the%' limit 10"))
 
6
  from datasets import load_dataset
7
  import streamlit as st
8
  import streamlit.components.v1 as components
9
+ import string
10
 
11
 
12
  st.set_page_config(page_title="DebateKG")
13
  st.title("DebateKG - Automatic Policy Debate Case Creation")
 
14
  st.caption("github: https://github.com/Hellisotherpeople/DebateKG")
15
 
16
 
17
  form = st.sidebar.form("Main Settings")
18
  form.header("Main Settings")
19
+ highlight_threshold = form.number_input("Enter the minimum similarity value needed to highlight" , value = 0.05)
20
+ show_extract = form.checkbox("Show extracts", value = True)
 
21
  show_abstract = form.checkbox("Show abstract", value = False)
22
  show_full_doc = form.checkbox("Show full doc", value = False)
23
+ show_citation = form.checkbox("Show citation", value = True)
24
+ rerank_word = form.text_input("(Optional) Constrain all evidence in the case to have this word within its text", value = "")
25
+ form.caption("Doing this may create graphs which are so constrained that DebateKG can't find a valid path in the graph to build a case")
26
+ html_window_width = form.number_input("Enter the pixel width of the output debate case window", value = 1000)
27
+ html_window_height = form.number_input("Enter the pixel height of the output debate case window", value = 1000)
28
+ option = form.selectbox(
29
+ 'Which Knowledge Graph do you want to use?',
30
+ ('DebateSum_SemanticGraph_longformer_extract.tar.gz', 'DebateSum_SemanticGraph_longformer_abstract.tar.gz', 'DebateSum_SemanticGraph_mpnet_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_sentence.tar.gz'), index = 2)
31
+
32
+ form.form_submit_button("Change Settings")
33
+
34
+ @st.cache(allow_output_mutation=True)
35
+ def load_my_dataset():
36
+ dataset = load_dataset("Hellisotherpeople/DebateSum", split = "train")
37
+ return dataset
38
+
39
+ @st.cache(allow_output_mutation=True)
40
+ def load_embeddings():
41
+ embeddings = Embeddings({
42
+ "path": "sentence-transformers/all-mpnet-base-v2",
43
+ "content": True,
44
+ "functions": [
45
+ {"name": "graph", "function": "graph.attribute"},
46
+ ],
47
+ "expressions": [
48
+ {"name": "topic", "expression": "graph(indexid, 'topic')"},
49
+ {"name": "topicrank", "expression": "graph(indexid, 'topicrank')"}
50
+ ],
51
+ "graph": {
52
+ "limit": 100,
53
+ "minscore": 0.10,
54
+ "topics": {
55
+ "terms": 4,
56
+ "resolution" : 100
57
+ }
58
+ }
59
+ })
60
+ embeddings.load(option)
61
+ return embeddings
62
+
63
+ dataset = load_my_dataset()
64
+ embeddings = load_embeddings()
65
+
66
  graph = embeddings.graph
67
 
68
  def david_distance(source, target, attrs):
 
72
  def david_showpath(source, target, the_graph):
73
  return nx.shortest_path(the_graph, source, target, david_distance)
74
 
75
+ def david_show_all_paths(source, target, the_graph):
76
+ return nx.all_shortest_paths(the_graph, source, target, david_distance)
77
 
78
 
 
 
79
  def highlight(index, result):
80
  output = f"{index}. "
81
+ spans = [(token, score, "#fff59d" if score > highlight_threshold else None) for token, score in result["tokens"]]
82
 
83
  for token, _, color in spans:
84
  output += f"<span style='background-color: {color}'>{token}</span> " if color else f"{token} "
 
99
  if strip_punctuation:
100
  text = text.translate(str.maketrans("","", string.punctuation))
101
  list_of_evidence_ids.append(int(embeddings.search(f"select id from txtai where similar('{text}') limit 1")[0]['id']))
 
102
 
103
  sections = []
104
+ #sections.append(list_of_evidence_ids)
105
  for x, p in enumerate(path):
106
  if x == 0:
107
  # Print start node
108
 
109
  sections.append(f"{x + 1}. {p}")
110
+ if show_abstract:
111
+ sections.append(dataset["Abstract"][list_of_evidence_ids[x]])
112
+ if show_citation:
113
+ sections.append(dataset["Citation"][list_of_evidence_ids[x]])
114
+ if show_extract:
115
+ sections.append(dataset["Extract"][list_of_evidence_ids[x]])
116
+ if show_full_doc:
117
+ sections.append(dataset["Full-Document"][list_of_evidence_ids[x]])
118
 
119
  if x < len(path) - 1:
120
  # Explain and highlight next path element
121
  results = embeddings.explain(p, [path[x + 1]], limit=1)[0]
122
  sections.append(highlight(x + 2, results))
123
+ if show_abstract:
124
+ sections.append(dataset["Abstract"][list_of_evidence_ids[x+1]])
125
+ if show_citation:
126
+ sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
127
+ if show_extract:
128
+ sections.append(dataset["Extract"][list_of_evidence_ids[x+1]])
129
+ if show_full_doc:
130
+ sections.append(dataset["Full-Document"][list_of_evidence_ids[x+1]])
131
 
132
+ return components.html("<br/><br/>".join(sections), scrolling = True, width = html_window_width, height = html_window_height)
133
 
134
  def question(text, rerank_word = "", rerank_topic = "", limit = 100):
135
  return embeddings.search(f"select id, text, topic, evidence_id, score from txtai where similar('{text}') and text like '%{rerank_word}%' and topic like '%{rerank_topic}%' limit {limit}")
 
137
 
138
 
139
  query_form = st.form("Query the Index:")
140
+ query_form.write("Step 1: Find Arguments")
141
+ query_form.write("Use semantic SQL from txtai to find some arguments, we use indexids to keep track of them.")
142
+ query_form.caption("You can use the semantic SQL to explore the dataset too! The possibilities are limitless!")
143
+ query_sql = query_form.text_area("Enter a semantic SQL statement", value = f"select topic, * from txtai where similar('Trump and US relations with China') and topic like '%trump%' and text like '%Donald%' limit 1")
144
+
145
+ query_form_submitted = query_form.form_submit_button("Query")
146
+
147
+ if query_form_submitted:
148
+ with st.expander("Output (Open Me)", expanded = False):
149
+ #my_path = showpath_any([170750, 50, 23])
150
+ #st.write(embeddings.search(f"select * from txtai where similar('you') and text like '%the%' limit 10"))
151
+ st.write(embeddings.search(query_sql))
152
+
153
+
154
+ paths_form = st.form("Build the Arguments")
155
+ paths_form.write("Step 2: Build a Policy Debate Case")
156
+ paths_form.write("Enter any number of indexids (arguments), DebateKG will build a debate case out of it which links them all together")
157
+ user_paths_string = paths_form.text_area("Enter a list of indexids seperated by whitespace", value = "250 10000 2405")
158
+ user_paths_list_of_strings = user_paths_string.split()
159
+ user_paths_list = list(map(int, user_paths_list_of_strings))
160
+
161
+ paths_form_submitted = paths_form.form_submit_button("Build a Policy Debate Case")
162
+
163
+ if paths_form_submitted:
164
+ if rerank_word:
165
+ selected_nodes = [n for n,v in graph.backend.nodes(data=True) if rerank_word in v['text']] ##also works for topic
166
+ H = graph.backend.subgraph(selected_nodes)
167
+ showpath_any(user_paths_list, the_graph = H)
168
+ else:
169
+ showpath_any(user_paths_list)
170
 
171
 
 
172