Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- __pycache__/LLM_usage.cpython-310.pyc +0 -0
- __pycache__/prompt.cpython-310.pyc +0 -0
- __pycache__/retriever.cpython-310.pyc +0 -0
- __pycache__/tokenizing.cpython-310.pyc +0 -0
- app.py +11 -7
- requirements.txt +3 -3
- retriever.py +3 -4
- test_docs.py +9 -7
__pycache__/LLM_usage.cpython-310.pyc
ADDED
|
Binary file (1.78 kB). View file
|
|
|
__pycache__/prompt.cpython-310.pyc
ADDED
|
Binary file (653 Bytes). View file
|
|
|
__pycache__/retriever.cpython-310.pyc
ADDED
|
Binary file (4.88 kB). View file
|
|
|
__pycache__/tokenizing.cpython-310.pyc
ADDED
|
Binary file (1.28 kB). View file
|
|
|
app.py
CHANGED
|
@@ -19,7 +19,9 @@ def initialize_bot(api_key):
|
|
| 19 |
# Set the API key
|
| 20 |
os.environ['GROQ_API_KEY'] = api_key
|
| 21 |
|
| 22 |
-
pathes = ["./Data/hate_speech_processed.json", "./Data/reddit_jokes2_processed.json",
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Load documents (done once)
|
| 25 |
if not docs: # Only load if docs are not already loaded
|
|
@@ -70,7 +72,11 @@ setup_demo = gr.Interface(
|
|
| 70 |
inputs=[gr.Textbox(label="Enter your GROQ API Key")],
|
| 71 |
outputs=[gr.Textbox(label="Setup Status")],
|
| 72 |
title="Setup Joke Generator",
|
| 73 |
-
description="Initialize the Joke Generator Bot by providing the GROQ API key.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
)
|
| 75 |
|
| 76 |
regime_options = ["BM25 Only", "Semantic Only", "Scores Combination"]
|
|
@@ -88,9 +94,8 @@ joke_demo = gr.Interface(
|
|
| 88 |
title="Joke Generator",
|
| 89 |
description="Generate jokes based on your input message(Only in English :( )). Select a retrieval regime and view the context used.\
|
| 90 |
Be careful, the jokes can be offensive! Try to write a message that is related to the joke you want to hear.\
|
| 91 |
-
(tell me a joke and its title about... or tell me a
|
| 92 |
-
In this case, try to rewrite a message and send again
|
| 93 |
-
the link again, after reinitialize joke generator with API KEY.\
|
| 94 |
Or try to change the regime or BM25 Coefficient.\
|
| 95 |
BM25 Coefficient is used to balance the BM25 and semantic scores(It is active only in Scores Combination mode). Semantic scores are multiplied by (1 - BM25 Coefficient).\
|
| 96 |
If you want to use only BM25 or semantic scores, select the corresponding regime or set it to 0.0 or 1.0. respectively.",
|
|
@@ -105,6 +110,5 @@ demo = gr.TabbedInterface(
|
|
| 105 |
)
|
| 106 |
|
| 107 |
# Launch the interface
|
| 108 |
-
# demo.launch()
|
| 109 |
-
# demo.launch(share=True)
|
| 110 |
demo.launch()
|
|
|
|
|
|
| 19 |
# Set the API key
|
| 20 |
os.environ['GROQ_API_KEY'] = api_key
|
| 21 |
|
| 22 |
+
pathes = ["./Data/hate_speech_processed.json", "./Data/reddit_jokes2_processed.json",
|
| 23 |
+
"./Data/stupidstuff_processed.json", "./Data/wocka_processed.json",
|
| 24 |
+
"./Data/reddit_jokes1_processed.json"]
|
| 25 |
|
| 26 |
# Load documents (done once)
|
| 27 |
if not docs: # Only load if docs are not already loaded
|
|
|
|
| 72 |
inputs=[gr.Textbox(label="Enter your GROQ API Key")],
|
| 73 |
outputs=[gr.Textbox(label="Setup Status")],
|
| 74 |
title="Setup Joke Generator",
|
| 75 |
+
description="Initialize the Joke Generator Bot by providing the GROQ API key. \
|
| 76 |
+
(If there is a connection error(on this or next tab) reload the page, wait 5-10 minutes, \
|
| 77 |
+
reload the page again and reinitialize the joke generator with the API KEY)\
|
| 78 |
+
If you see some runtime error like memory limit exceeded, tell me on mail: [email protected](I can see your email not so fast)\
|
| 79 |
+
or tg: @Beav3rrr and I will redeploy or turn on new instance",
|
| 80 |
)
|
| 81 |
|
| 82 |
regime_options = ["BM25 Only", "Semantic Only", "Scores Combination"]
|
|
|
|
| 94 |
title="Joke Generator",
|
| 95 |
description="Generate jokes based on your input message(Only in English :( )). Select a retrieval regime and view the context used.\
|
| 96 |
Be careful, the jokes can be offensive! Try to write a message that is related to the joke you want to hear.\
|
| 97 |
+
(tell me a joke and its title about... or tell me a joke and its title about... it should be a oneliner, dark, pervy, etc.). Sometimes bot works bad :(\
|
| 98 |
+
In this case, try to rewrite a message and send again.\
|
|
|
|
| 99 |
Or try to change the regime or BM25 Coefficient.\
|
| 100 |
BM25 Coefficient is used to balance the BM25 and semantic scores(It is active only in Scores Combination mode). Semantic scores are multiplied by (1 - BM25 Coefficient).\
|
| 101 |
If you want to use only BM25 or semantic scores, select the corresponding regime or set it to 0.0 or 1.0. respectively.",
|
|
|
|
| 110 |
)
|
| 111 |
|
| 112 |
# Launch the interface
|
|
|
|
|
|
|
| 113 |
demo.launch()
|
| 114 |
+
# demo.launch(share=True)
|
requirements.txt
CHANGED
|
@@ -18,7 +18,7 @@ distro==1.9.0
|
|
| 18 |
docopt==0.6.2
|
| 19 |
exceptiongroup==1.2.2
|
| 20 |
executing==2.1.0
|
| 21 |
-
fastapi==0.115.5
|
| 22 |
ffmpy==0.4.0
|
| 23 |
filelock==3.16.1
|
| 24 |
frozenlist==1.5.0
|
|
@@ -108,7 +108,7 @@ sentence-transformers==3.3.1
|
|
| 108 |
shellingham==1.5.4
|
| 109 |
six==1.16.0
|
| 110 |
sniffio==1.3.1
|
| 111 |
-
stack-data==0.6.
|
| 112 |
starlette==0.41.3
|
| 113 |
sympy==1.13.3
|
| 114 |
threadpoolctl==3.5.0
|
|
@@ -130,5 +130,5 @@ uvicorn==0.32.0
|
|
| 130 |
watchdog==5.0.3
|
| 131 |
wcwidth==0.2.13
|
| 132 |
websockets==12.0
|
| 133 |
-
yarl==1.17.2
|
| 134 |
zipp==3.21.0
|
|
|
|
| 18 |
docopt==0.6.2
|
| 19 |
exceptiongroup==1.2.2
|
| 20 |
executing==2.1.0
|
| 21 |
+
fastapi==0.115.5
|
| 22 |
ffmpy==0.4.0
|
| 23 |
filelock==3.16.1
|
| 24 |
frozenlist==1.5.0
|
|
|
|
| 108 |
shellingham==1.5.4
|
| 109 |
six==1.16.0
|
| 110 |
sniffio==1.3.1
|
| 111 |
+
stack-data==0.6.3
|
| 112 |
starlette==0.41.3
|
| 113 |
sympy==1.13.3
|
| 114 |
threadpoolctl==3.5.0
|
|
|
|
| 130 |
watchdog==5.0.3
|
| 131 |
wcwidth==0.2.13
|
| 132 |
websockets==12.0
|
| 133 |
+
yarl==1.17.2
|
| 134 |
zipp==3.21.0
|
retriever.py
CHANGED
|
@@ -92,6 +92,9 @@ class Retriever:
|
|
| 92 |
# In case of BM25 only, return the top n documents based on BM25 scores, if somebody sets a couple
|
| 93 |
# of flags to True, the func will return the top n documents based on the first flag set to True
|
| 94 |
|
|
|
|
|
|
|
|
|
|
| 95 |
if bm25_only:
|
| 96 |
semantic_only = False
|
| 97 |
scores_combination = False
|
|
@@ -112,10 +115,6 @@ class Retriever:
|
|
| 112 |
# Sort the documents by their BM25 scores in descending order
|
| 113 |
sorted_doc_indices = np.argsort(scores)
|
| 114 |
|
| 115 |
-
print("Score:", scores[sorted_doc_indices[-1]] )
|
| 116 |
-
print(self.docs[sorted_doc_indices[-1]])
|
| 117 |
-
print("Doc number:", sorted_doc_indices[-1])
|
| 118 |
-
|
| 119 |
result_docs = [self.docs[i] for i in sorted_doc_indices[-n:] if scores[i] > 0]
|
| 120 |
|
| 121 |
return result_docs[::-1] # Return the top n documents in descending order which means the most relevant documents are first
|
|
|
|
| 92 |
# In case of BM25 only, return the top n documents based on BM25 scores, if somebody sets a couple
|
| 93 |
# of flags to True, the func will return the top n documents based on the first flag set to True
|
| 94 |
|
| 95 |
+
# remove "tell me a joke about" or "tell me a joke and its title about" from the user message
|
| 96 |
+
user_message = user_message.replace("tell me a joke about", "").replace("tell me a joke and its title about", "")
|
| 97 |
+
|
| 98 |
if bm25_only:
|
| 99 |
semantic_only = False
|
| 100 |
scores_combination = False
|
|
|
|
| 115 |
# Sort the documents by their BM25 scores in descending order
|
| 116 |
sorted_doc_indices = np.argsort(scores)
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
result_docs = [self.docs[i] for i in sorted_doc_indices[-n:] if scores[i] > 0]
|
| 119 |
|
| 120 |
return result_docs[::-1] # Return the top n documents in descending order which means the most relevant documents are first
|
test_docs.py
CHANGED
|
@@ -15,17 +15,18 @@ tokenized_docs_path = os.path.join(base_path, "tokenized_docs.pkl")
|
|
| 15 |
|
| 16 |
# Take all json files with names that end '_processed'
|
| 17 |
for path in glob.glob(f"{base_path}/*_processed.json"):
|
|
|
|
| 18 |
with open(path, 'r') as f:
|
| 19 |
docs.extend(json.load(f))
|
| 20 |
|
| 21 |
index = 0
|
| 22 |
|
| 23 |
-
for i, doc in enumerate(docs):
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
|
| 30 |
with open(bm25_path, 'rb') as f:
|
| 31 |
bm25 = pickle.load(f)
|
|
@@ -39,7 +40,7 @@ with open(bm25_path, 'rb') as f:
|
|
| 39 |
# with open(bm25_path, 'wb') as f:
|
| 40 |
# pickle.dump(bm25, f)
|
| 41 |
|
| 42 |
-
message = "tell me a joke about
|
| 43 |
tokenized_message = tokenize_text(message)
|
| 44 |
print(tokenized_message)
|
| 45 |
scores = torch.tensor(bm25.get_scores(tokenized_message))
|
|
@@ -48,6 +49,7 @@ sorted_doc_indices = np.argsort(scores)
|
|
| 48 |
for i in range(1, 2):
|
| 49 |
print("Score:", scores[sorted_doc_indices[-i]] )
|
| 50 |
print(docs[sorted_doc_indices[-i]])
|
|
|
|
| 51 |
|
| 52 |
# result_docs = [docs[i] for i in sorted_doc_indices[-30:] if scores[i] > 0]
|
| 53 |
|
|
|
|
| 15 |
|
| 16 |
# Take all json files with names that end '_processed'
|
| 17 |
for path in glob.glob(f"{base_path}/*_processed.json"):
|
| 18 |
+
print(path)
|
| 19 |
with open(path, 'r') as f:
|
| 20 |
docs.extend(json.load(f))
|
| 21 |
|
| 22 |
index = 0
|
| 23 |
|
| 24 |
+
# for i, doc in enumerate(docs):
|
| 25 |
+
# if 'body' in doc:
|
| 26 |
+
# if doc['body'] == "I don't fuck the sandwich before eating it":
|
| 27 |
+
# tokenized_doc = tokenize_doc(doc)
|
| 28 |
+
# print(tokenized_doc)
|
| 29 |
+
# index = i
|
| 30 |
|
| 31 |
with open(bm25_path, 'rb') as f:
|
| 32 |
bm25 = pickle.load(f)
|
|
|
|
| 40 |
# with open(bm25_path, 'wb') as f:
|
| 41 |
# pickle.dump(bm25, f)
|
| 42 |
|
| 43 |
+
message = "tell me a joke about sandwich before eating it"
|
| 44 |
tokenized_message = tokenize_text(message)
|
| 45 |
print(tokenized_message)
|
| 46 |
scores = torch.tensor(bm25.get_scores(tokenized_message))
|
|
|
|
| 49 |
for i in range(1, 2):
|
| 50 |
print("Score:", scores[sorted_doc_indices[-i]] )
|
| 51 |
print(docs[sorted_doc_indices[-i]])
|
| 52 |
+
print("Doc number:", sorted_doc_indices[-i])
|
| 53 |
|
| 54 |
# result_docs = [docs[i] for i in sorted_doc_indices[-30:] if scores[i] > 0]
|
| 55 |
|