Hellisotherpeople committed on
Commit
0723a2c
·
1 Parent(s): 04d9da1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -54
app.py CHANGED
@@ -6,55 +6,63 @@ from txtai.graph import GraphFactory
6
  from datasets import load_dataset
7
  import streamlit as st
8
  import streamlit.components.v1 as components
9
-
10
 
11
 
12
  st.set_page_config(page_title="DebateKG")
13
  st.title("DebateKG - Automatic Policy Debate Case Creation")
14
- st.write("WIP, give me a few more days before reviewing!")
15
  st.caption("github: https://github.com/Hellisotherpeople/DebateKG")
16
 
17
 
18
  form = st.sidebar.form("Main Settings")
19
  form.header("Main Settings")
20
- number_of_paths = form.number_input("Enter the cutoff number of paths for all shortest path search", value = 4)
21
- highlight_threshold = form.number_input("Enter the minimum similarity value needed to highlight" , value = 4)
22
- show_extract = form.checkbox("Show extracts", value = False)
23
  show_abstract = form.checkbox("Show abstract", value = False)
24
  show_full_doc = form.checkbox("Show full doc", value = False)
25
- show_citation = form.checkbox("Show citation", value = False)
26
- rerank_word = form.text_area("Enter the word", value = "Full-Document")
27
- rerank_topic = form.text_area("Enter the topic", value = "Full-Document")
28
-
29
- form.form_submit_button("Submit")
30
-
31
-
32
-
33
- dataset = load_dataset("Hellisotherpeople/DebateSum", split = "train")
34
- seg = pysbd.Segmenter(language="en", clean=False)
35
-
36
-
37
- embeddings = Embeddings({
38
- "path": "sentence-transformers/all-mpnet-base-v2",
39
- "content": True,
40
- "functions": [
41
- {"name": "graph", "function": "graph.attribute"},
42
- ],
43
- "expressions": [
44
- {"name": "topic", "expression": "graph(indexid, 'topic')"},
45
- {"name": "topicrank", "expression": "graph(indexid, 'topicrank')"}
46
- ],
47
- "graph": {
48
- "limit": 100,
49
- "minscore": 0.10,
50
- "topics": {
51
- "terms": 4,
52
- "resolution" : 100
53
- }
54
- }
55
- })
56
-
57
- embeddings.load("DebateSum_SemanticGraph_mpnet_extract.tar.gz")
 
 
 
 
 
 
 
 
 
 
58
  graph = embeddings.graph
59
 
60
  def david_distance(source, target, attrs):
@@ -64,13 +72,13 @@ def david_distance(source, target, attrs):
64
  def david_showpath(source, target, the_graph):
65
  return nx.shortest_path(the_graph, source, target, david_distance)
66
 
 
 
67
 
68
 
69
- import string
70
-
71
  def highlight(index, result):
72
  output = f"{index}. "
73
- spans = [(token, score, "#fff59d" if score > 0.01 else None) for token, score in result["tokens"]]
74
 
75
  for token, _, color in spans:
76
  output += f"<span style='background-color: {color}'>{token}</span> " if color else f"{token} "
@@ -91,27 +99,37 @@ def showpath_any(list_of_arguments, strip_punctuation = True, the_graph=graph.ba
91
  if strip_punctuation:
92
  text = text.translate(str.maketrans("","", string.punctuation))
93
  list_of_evidence_ids.append(int(embeddings.search(f"select id from txtai where similar('{text}') limit 1")[0]['id']))
94
- print(list_of_evidence_ids)
95
 
96
  sections = []
 
97
  for x, p in enumerate(path):
98
  if x == 0:
99
  # Print start node
100
 
101
  sections.append(f"{x + 1}. {p}")
102
- #sections.append(dataset["Abstract"][list_of_evidence_ids[x]])
103
- #sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
104
- #sections.append(dataset["Full-Document"][list_of_evidence_ids[x]])
 
 
 
 
 
105
 
106
  if x < len(path) - 1:
107
  # Explain and highlight next path element
108
  results = embeddings.explain(p, [path[x + 1]], limit=1)[0]
109
  sections.append(highlight(x + 2, results))
110
- #sections.append(dataset["Abstract"][list_of_evidence_ids[x+1]])
111
- #sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
112
- #sections.append(dataset["Full-Document"][list_of_evidence_ids[x+1]])
 
 
 
 
 
113
 
114
- return components.html("<br/><br/>".join(sections), scrolling = True, width = 800, height = 1000)
115
 
116
  def question(text, rerank_word = "", rerank_topic = "", limit = 100):
117
  return embeddings.search(f"select id, text, topic, evidence_id, score from txtai where similar('{text}') and text like '%{rerank_word}%' and topic like '%{rerank_topic}%' limit {limit}")
@@ -119,11 +137,36 @@ def question(text, rerank_word = "", rerank_topic = "", limit = 100):
119
 
120
 
121
  query_form = st.form("Query the Index:")
122
- query_form.write("Write a SQL query")
123
- query_form_submitted = query_form.form_submit_button("Click me to get ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
 
126
- #showpath_any([3, 12, 15])
127
 
128
- with st.expander("mine", expanded = False):
129
- st.write(embeddings.search(f"select * from txtai where similar('you') and text like '%the%' limit 10"))
 
6
  from datasets import load_dataset
7
  import streamlit as st
8
  import streamlit.components.v1 as components
9
+ import string
10
 
11
 
12
  st.set_page_config(page_title="DebateKG")
13
  st.title("DebateKG - Automatic Policy Debate Case Creation")
 
14
  st.caption("github: https://github.com/Hellisotherpeople/DebateKG")
15
 
16
 
17
  form = st.sidebar.form("Main Settings")
18
  form.header("Main Settings")
19
+ highlight_threshold = form.number_input("Enter the minimum similarity value needed to highlight" , value = 0.05)
20
+ show_extract = form.checkbox("Show extracts", value = True)
 
21
  show_abstract = form.checkbox("Show abstract", value = False)
22
  show_full_doc = form.checkbox("Show full doc", value = False)
23
+ show_citation = form.checkbox("Show citation", value = True)
24
+ rerank_word = form.text_input("(Optional) Constrain all evidence in the case to have this word within its text", value = "")
25
+ form.caption("Doing this may create graphs which are so constrained that DebateKG can't find a valid path in the graph to build a case")
26
+ html_window_width = form.number_input("Enter the pixel width of the output debate case window", value = 1000)
27
+ html_window_height = form.number_input("Enter the pixel height of the output debate case window", value = 1000)
28
+ option = form.selectbox(
29
+ 'Which Knowledge Graph do you want to use?',
30
+ ('DebateSum_SemanticGraph_longformer_extract.tar.gz', 'DebateSum_SemanticGraph_longformer_abstract.tar.gz', 'DebateSum_SemanticGraph_mpnet_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_sentence.tar.gz'), index = 2)
31
+
32
+ form.form_submit_button("Change Settings")
33
+
34
+ @st.cache(allow_output_mutation=True)
35
+ def load_my_dataset():
36
+ dataset = load_dataset("Hellisotherpeople/DebateSum", split = "train")
37
+ return dataset
38
+
39
+ @st.cache(allow_output_mutation=True)
40
+ def load_embeddings():
41
+ embeddings = Embeddings({
42
+ "path": "sentence-transformers/all-mpnet-base-v2",
43
+ "content": True,
44
+ "functions": [
45
+ {"name": "graph", "function": "graph.attribute"},
46
+ ],
47
+ "expressions": [
48
+ {"name": "topic", "expression": "graph(indexid, 'topic')"},
49
+ {"name": "topicrank", "expression": "graph(indexid, 'topicrank')"}
50
+ ],
51
+ "graph": {
52
+ "limit": 100,
53
+ "minscore": 0.10,
54
+ "topics": {
55
+ "terms": 4,
56
+ "resolution" : 100
57
+ }
58
+ }
59
+ })
60
+ embeddings.load(option)
61
+ return embeddings
62
+
63
+ dataset = load_my_dataset()
64
+ embeddings = load_embeddings()
65
+
66
  graph = embeddings.graph
67
 
68
  def david_distance(source, target, attrs):
 
72
  def david_showpath(source, target, the_graph):
73
  return nx.shortest_path(the_graph, source, target, david_distance)
74
 
75
+ def david_show_all_paths(source, target, the_graph):
76
+ return nx.all_shortest_paths(the_graph, source, target, david_distance)
77
 
78
 
 
 
79
  def highlight(index, result):
80
  output = f"{index}. "
81
+ spans = [(token, score, "#fff59d" if score > highlight_threshold else None) for token, score in result["tokens"]]
82
 
83
  for token, _, color in spans:
84
  output += f"<span style='background-color: {color}'>{token}</span> " if color else f"{token} "
 
99
  if strip_punctuation:
100
  text = text.translate(str.maketrans("","", string.punctuation))
101
  list_of_evidence_ids.append(int(embeddings.search(f"select id from txtai where similar('{text}') limit 1")[0]['id']))
 
102
 
103
  sections = []
104
+ #sections.append(list_of_evidence_ids)
105
  for x, p in enumerate(path):
106
  if x == 0:
107
  # Print start node
108
 
109
  sections.append(f"{x + 1}. {p}")
110
+ if show_abstract:
111
+ sections.append(dataset["Abstract"][list_of_evidence_ids[x]])
112
+ if show_citation:
113
+ sections.append(dataset["Citation"][list_of_evidence_ids[x]])
114
+ if show_extract:
115
+ sections.append(dataset["Extract"][list_of_evidence_ids[x]])
116
+ if show_full_doc:
117
+ sections.append(dataset["Full-Document"][list_of_evidence_ids[x]])
118
 
119
  if x < len(path) - 1:
120
  # Explain and highlight next path element
121
  results = embeddings.explain(p, [path[x + 1]], limit=1)[0]
122
  sections.append(highlight(x + 2, results))
123
+ if show_abstract:
124
+ sections.append(dataset["Abstract"][list_of_evidence_ids[x+1]])
125
+ if show_citation:
126
+ sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
127
+ if show_extract:
128
+ sections.append(dataset["Extract"][list_of_evidence_ids[x+1]])
129
+ if show_full_doc:
130
+ sections.append(dataset["Full-Document"][list_of_evidence_ids[x+1]])
131
 
132
+ return components.html("<br/><br/>".join(sections), scrolling = True, width = html_window_width, height = html_window_height)
133
 
134
  def question(text, rerank_word = "", rerank_topic = "", limit = 100):
135
  return embeddings.search(f"select id, text, topic, evidence_id, score from txtai where similar('{text}') and text like '%{rerank_word}%' and topic like '%{rerank_topic}%' limit {limit}")
 
137
 
138
 
139
  query_form = st.form("Query the Index:")
140
+ query_form.write("Step 1: Find Arguments")
141
+ query_form.write("Use semantic SQL from txtai to find some arguments, we use indexids to keep track of them.")
142
+ query_form.caption("You can use the semantic SQL to explore the dataset too! The possibilities are limitless!")
143
+ query_sql = query_form.text_area("Enter a semantic SQL statement", value = f"select topic, * from txtai where similar('Trump and US relations with China') and topic like '%trump%' and text like '%Donald%' limit 1")
144
+
145
+ query_form_submitted = query_form.form_submit_button("Query")
146
+
147
+ if query_form_submitted:
148
+ with st.expander("Output (Open Me)", expanded = False):
149
+ #my_path = showpath_any([170750, 50, 23])
150
+ #st.write(embeddings.search(f"select * from txtai where similar('you') and text like '%the%' limit 10"))
151
+ st.write(embeddings.search(query_sql))
152
+
153
+
154
+ paths_form = st.form("Build the Arguments")
155
+ paths_form.write("Step 2: Build a Policy Debate Case")
156
+ paths_form.write("Enter any number of indexids (arguments), DebateKG will build a debate case out of it which links them all together")
157
+ user_paths_string = paths_form.text_area("Enter a list of indexids seperated by whitespace", value = "250 10000 2405")
158
+ user_paths_list_of_strings = user_paths_string.split()
159
+ user_paths_list = list(map(int, user_paths_list_of_strings))
160
+
161
+ paths_form_submitted = paths_form.form_submit_button("Build a Policy Debate Case")
162
+
163
+ if paths_form_submitted:
164
+ if rerank_word:
165
+ selected_nodes = [n for n,v in graph.backend.nodes(data=True) if rerank_word in v['text']] ##also works for topic
166
+ H = graph.backend.subgraph(selected_nodes)
167
+ showpath_any(user_paths_list, the_graph = H)
168
+ else:
169
+ showpath_any(user_paths_list)
170
 
171
 
 
172