Spaces:
Running
16k prompt (#19)
Browse files
* 16k context window
* threshold and top_k changed to 10
* changed max_tokens to fit new context window
* changed before and after prompts with more details
* added number of chunks per source
* formatting
* used online dataset
* remove before docs, remove summarize keywords from prompt
* answer between 1 and 8 paragraphs
* add prompt text_before_docs back + overall prompt modifications
* format with black
* expand chatbot box to full height and change demo theme
* prompt change, ask to return 5 paragraphs
* change space name to dev
* new dataset curated TAI, no wiki
* new dataset curated TAI, no wiki
* change sources
* adding all datasets
* added all new csvs
* rm tmp
* update vector db push
* fix prompt for OOD detector
* Added cleaner sources and sources in refs
* ran black formatting
---------
Co-authored-by: Omar Solano <[email protected]>
Co-authored-by: Omar Solano <[email protected]>
Co-authored-by: Jeremy Pinto <[email protected]>
- README.md +3 -3
- app.py +48 -10
- cfg.py +24 -20
- data/embed_documents.py +3 -3
- data/markdown_parser.py +9 -7
- data/process_csvs_store.py +68 -0
- data/remove_extra_notion_titles.py +19 -0
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
-
title: TowardsAI 🤝 Buster
|
3 |
emoji: 🤖
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
app_file: app.py
|
8 |
pinned: false
|
|
|
1 |
---
|
2 |
+
title: TowardsAI 🤝 Buster - Dev
|
3 |
emoji: 🤖
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: blue
|
6 |
sdk: gradio
|
7 |
app_file: app.py
|
8 |
pinned: false
|
@@ -18,7 +18,21 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
18 |
logger = logging.getLogger(__name__)
|
19 |
logging.basicConfig(level=logging.INFO)
|
20 |
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
|
24 |
def format_sources(matched_documents: pd.DataFrame) -> str:
|
@@ -26,17 +40,29 @@ def format_sources(matched_documents: pd.DataFrame) -> str:
|
|
26 |
return ""
|
27 |
|
28 |
documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
|
29 |
-
document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"
|
30 |
|
31 |
matched_documents.similarity_to_answer = (
|
32 |
matched_documents.similarity_to_answer * 100
|
33 |
)
|
34 |
|
|
|
|
|
|
|
|
|
35 |
# drop duplicates, keep highest ranking ones
|
36 |
matched_documents = matched_documents.sort_values(
|
37 |
"similarity_to_answer", ascending=False
|
38 |
).drop_duplicates("title", keep="first")
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
documents = "\n".join(
|
41 |
[
|
42 |
document_template.format(document=document)
|
@@ -77,7 +103,13 @@ def get_answer(history, sources: Optional[list[str]] = None):
|
|
77 |
completion = get_empty_source_completion(user_input)
|
78 |
|
79 |
else:
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
history[-1][1] = ""
|
83 |
|
@@ -87,26 +119,32 @@ def get_answer(history, sources: Optional[list[str]] = None):
|
|
87 |
yield history, completion
|
88 |
|
89 |
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
with demo:
|
92 |
with gr.Row():
|
93 |
gr.Markdown(
|
94 |
-
"<h3><center>Buster 🤖: A Question-Answering Bot for
|
95 |
)
|
96 |
-
|
97 |
source_selection = gr.Dropdown(
|
98 |
-
choices=
|
99 |
label="Select Sources",
|
100 |
-
value=
|
101 |
multiselect=True,
|
102 |
)
|
103 |
|
104 |
-
chatbot = gr.Chatbot()
|
105 |
|
106 |
with gr.Row():
|
107 |
question = gr.Textbox(
|
108 |
label="What's your question?",
|
109 |
-
placeholder="Ask a question to AI
|
110 |
lines=1,
|
111 |
)
|
112 |
submit = gr.Button(value="Send", variant="secondary")
|
|
|
18 |
logger = logging.getLogger(__name__)
|
19 |
logging.basicConfig(level=logging.INFO)
|
20 |
|
21 |
+
AVAILABLE_SOURCES_UI = [
|
22 |
+
"Toward's AI",
|
23 |
+
"HuggingFace",
|
24 |
+
"Wikipedia",
|
25 |
+
"Gen AI 360: LangChain",
|
26 |
+
"Gen AI 360: LLMs",
|
27 |
+
]
|
28 |
+
|
29 |
+
AVAILABLE_SOURCES = [
|
30 |
+
"towards_ai",
|
31 |
+
"hf_transformers",
|
32 |
+
"wikipedia",
|
33 |
+
"langchain_course",
|
34 |
+
"llm_course",
|
35 |
+
]
|
36 |
|
37 |
|
38 |
def format_sources(matched_documents: pd.DataFrame) -> str:
|
|
|
40 |
return ""
|
41 |
|
42 |
documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
|
43 |
+
document_template: str = "[🔗 {document.source}: {document.title}]({document.url}), highest relevance: {document.similarity_to_answer:2.1f} % | # total chunks matched: {document.repetition:d}"
|
44 |
|
45 |
matched_documents.similarity_to_answer = (
|
46 |
matched_documents.similarity_to_answer * 100
|
47 |
)
|
48 |
|
49 |
+
matched_documents["repetition"] = matched_documents.groupby("title")[
|
50 |
+
"title"
|
51 |
+
].transform("size")
|
52 |
+
|
53 |
# drop duplicates, keep highest ranking ones
|
54 |
matched_documents = matched_documents.sort_values(
|
55 |
"similarity_to_answer", ascending=False
|
56 |
).drop_duplicates("title", keep="first")
|
57 |
|
58 |
+
# Revert back to correct display
|
59 |
+
display_source_to_ui = {
|
60 |
+
ui: src for ui, src in zip(AVAILABLE_SOURCES, AVAILABLE_SOURCES_UI)
|
61 |
+
}
|
62 |
+
matched_documents["source"] = matched_documents["source"].replace(
|
63 |
+
display_source_to_ui
|
64 |
+
)
|
65 |
+
|
66 |
documents = "\n".join(
|
67 |
[
|
68 |
document_template.format(document=document)
|
|
|
103 |
completion = get_empty_source_completion(user_input)
|
104 |
|
105 |
else:
|
106 |
+
# Go to code names
|
107 |
+
display_ui_to_source = {
|
108 |
+
ui: src for ui, src in zip(AVAILABLE_SOURCES_UI, AVAILABLE_SOURCES)
|
109 |
+
}
|
110 |
+
|
111 |
+
sources_renamed = [display_ui_to_source[disp] for disp in sources]
|
112 |
+
completion = buster.process_input(user_input, sources=sources_renamed)
|
113 |
|
114 |
history[-1][1] = ""
|
115 |
|
|
|
119 |
yield history, completion
|
120 |
|
121 |
|
122 |
+
CSS = """
|
123 |
+
.contain { display: flex; flex-direction: column; }
|
124 |
+
.gradio-container { height: 100vh !important; }
|
125 |
+
#component-0 { height: 100%; }
|
126 |
+
#chatbot { flex-grow: 1; overflow: auto;}
|
127 |
+
"""
|
128 |
+
theme = gr.themes.Base()
|
129 |
+
demo = gr.Blocks(css=CSS, theme=theme)
|
130 |
with demo:
|
131 |
with gr.Row():
|
132 |
gr.Markdown(
|
133 |
+
"<h3><center>Toward's AI x Buster 🤖: A Question-Answering Bot for anything AI-related</center></h3>"
|
134 |
)
|
|
|
135 |
source_selection = gr.Dropdown(
|
136 |
+
choices=AVAILABLE_SOURCES_UI,
|
137 |
label="Select Sources",
|
138 |
+
value=AVAILABLE_SOURCES_UI,
|
139 |
multiselect=True,
|
140 |
)
|
141 |
|
142 |
+
chatbot = gr.Chatbot(elem_id="chatbot")
|
143 |
|
144 |
with gr.Row():
|
145 |
question = gr.Textbox(
|
146 |
label="What's your question?",
|
147 |
+
placeholder="Ask a question to our AI tutor here...",
|
148 |
lines=1,
|
149 |
)
|
150 |
submit = gr.Button(value="Send", variant="secondary")
|
@@ -43,9 +43,11 @@ buster_cfg = BusterConfig(
|
|
43 |
"use_reranking": True,
|
44 |
"invalid_question_response": "This question does not seem relevant to my current knowledge.",
|
45 |
"check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
|
46 |
-
|
|
|
47 |
Users will learn to build LLM-powered apps, with LangChain & Deep Lake among other technologies.
|
48 |
-
As long as a question is somewhat related to the topic, respond 'true'. If a question is
|
|
|
49 |
|
50 |
For example:
|
51 |
|
@@ -55,7 +57,8 @@ true
|
|
55 |
Q: What is the meaning of life?
|
56 |
false
|
57 |
|
58 |
-
|
|
|
59 |
"completion_kwargs": {
|
60 |
"model": "gpt-3.5-turbo",
|
61 |
"stream": False,
|
@@ -64,9 +67,9 @@ A user will now submit a question. Respond 'true' if it is valid, respond 'false
|
|
64 |
},
|
65 |
retriever_cfg={
|
66 |
"path": f"{DEEPLAKE_DATASET_PATH}",
|
67 |
-
"top_k":
|
68 |
-
"thresh": 0.
|
69 |
-
"max_tokens":
|
70 |
"embedding_model": "text-embedding-ada-002",
|
71 |
"exec_option": "compute_engine",
|
72 |
"use_tql": True,
|
@@ -77,41 +80,41 @@ A user will now submit a question. Respond 'true' if it is valid, respond 'false
|
|
77 |
},
|
78 |
completion_cfg={
|
79 |
"completion_kwargs": {
|
80 |
-
"model": "gpt-3.5-turbo",
|
81 |
"stream": True,
|
82 |
"temperature": 0,
|
83 |
},
|
84 |
},
|
85 |
tokenizer_cfg={
|
86 |
-
"model_name": "gpt-3.5-turbo",
|
87 |
},
|
88 |
documents_formatter_cfg={
|
89 |
-
"max_tokens":
|
90 |
"columns": ["content", "source", "title"],
|
91 |
},
|
92 |
prompt_formatter_cfg={
|
93 |
-
"max_tokens":
|
94 |
"text_before_docs": (
|
95 |
-
"You are a
|
96 |
"You are provided information found in the json documentation. "
|
97 |
-
"Only respond with
|
98 |
-
"If the answer is in the documentation,
|
99 |
"If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
|
100 |
"Here is the information you can use (json documentation): "
|
101 |
),
|
102 |
"text_after_docs": (
|
103 |
"REMEMBER:\n"
|
104 |
-
"You are a
|
105 |
-
"You are provided information found in the . "
|
106 |
"Here are the rules you must follow:\n"
|
107 |
-
"* Only respond with
|
108 |
-
"* If the answer is in the documentation,
|
109 |
"* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
|
110 |
-
"* Only
|
111 |
-
"* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
|
112 |
"* Do not reference any links, urls or hyperlinks in your answers.\n"
|
113 |
"* Make sure to format your answers in Markdown format, including code block and snippets.\n"
|
114 |
-
"* If you do not know the answer to a question, or if it is completely irrelevant to the
|
115 |
"'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'"
|
116 |
"For example:\n"
|
117 |
"What is the meaning of life for a qa bot?\n"
|
@@ -139,4 +142,5 @@ def setup_buster(buster_cfg):
|
|
139 |
buster: Buster = Buster(
|
140 |
retriever=retriever, document_answerer=document_answerer, validator=validator
|
141 |
)
|
|
|
142 |
return buster
|
|
|
43 |
"use_reranking": True,
|
44 |
"invalid_question_response": "This question does not seem relevant to my current knowledge.",
|
45 |
"check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
|
46 |
+
Your job is to determine whether user's question is valid or not. Users will not always submit a question either.
|
47 |
+
Users will ask all sorts of questions, and some might be tangentially related to artificial intelligence.
|
48 |
Users will learn to build LLM-powered apps, with LangChain & Deep Lake among other technologies.
|
49 |
+
As long as a question is somewhat related to the topic of AI, respond 'true'. If a question is on a different subject or unrelated, respond 'false'.
|
50 |
+
Make sure the question is a valid question.
|
51 |
|
52 |
For example:
|
53 |
|
|
|
57 |
Q: What is the meaning of life?
|
58 |
false
|
59 |
|
60 |
+
Q:
|
61 |
+
""",
|
62 |
"completion_kwargs": {
|
63 |
"model": "gpt-3.5-turbo",
|
64 |
"stream": False,
|
|
|
67 |
},
|
68 |
retriever_cfg={
|
69 |
"path": f"{DEEPLAKE_DATASET_PATH}",
|
70 |
+
"top_k": 10,
|
71 |
+
"thresh": 0.55,
|
72 |
+
"max_tokens": 13000,
|
73 |
"embedding_model": "text-embedding-ada-002",
|
74 |
"exec_option": "compute_engine",
|
75 |
"use_tql": True,
|
|
|
80 |
},
|
81 |
completion_cfg={
|
82 |
"completion_kwargs": {
|
83 |
+
"model": "gpt-3.5-turbo-16k",
|
84 |
"stream": True,
|
85 |
"temperature": 0,
|
86 |
},
|
87 |
},
|
88 |
tokenizer_cfg={
|
89 |
+
"model_name": "gpt-3.5-turbo-16k",
|
90 |
},
|
91 |
documents_formatter_cfg={
|
92 |
+
"max_tokens": 13500,
|
93 |
"columns": ["content", "source", "title"],
|
94 |
},
|
95 |
prompt_formatter_cfg={
|
96 |
+
"max_tokens": 13500,
|
97 |
"text_before_docs": (
|
98 |
+
"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
|
99 |
"You are provided information found in the json documentation. "
|
100 |
+
"Only respond with information inside the json documentation. DO NOT use additional information, even if you know the answer. "
|
101 |
+
"If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation, answer in 5 paragraphs."
|
102 |
"If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
|
103 |
"Here is the information you can use (json documentation): "
|
104 |
),
|
105 |
"text_after_docs": (
|
106 |
"REMEMBER:\n"
|
107 |
+
"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context."
|
108 |
+
"You are provided information found in the json documentation. "
|
109 |
"Here are the rules you must follow:\n"
|
110 |
+
"* Only respond with information inside the json documentation. DO NOT provide additional information, even if you know the answer. "
|
111 |
+
"* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation, answer in 5 paragraphs. "
|
112 |
"* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
|
113 |
+
"* Only use information summarized from the json documentation, do not respond otherwise. "
|
114 |
+
"* Do not refer to the json documentation directly, but use the instructions provided within it to answer questions. "
|
115 |
"* Do not reference any links, urls or hyperlinks in your answers.\n"
|
116 |
"* Make sure to format your answers in Markdown format, including code block and snippets.\n"
|
117 |
+
"* If you do not know the answer to a question, or if it is completely irrelevant to the AI courses, simply reply with:\n"
|
118 |
"'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'"
|
119 |
"For example:\n"
|
120 |
"What is the meaning of life for a qa bot?\n"
|
|
|
142 |
buster: Buster = Buster(
|
143 |
retriever=retriever, document_answerer=document_answerer, validator=validator
|
144 |
)
|
145 |
+
|
146 |
return buster
|
@@ -2,9 +2,9 @@ import pandas as pd
|
|
2 |
from buster.documents_manager import DeepLakeDocumentsManager
|
3 |
|
4 |
if __name__ == "__main__":
|
5 |
-
vector_store_path = "
|
6 |
-
chunk_file = "./
|
7 |
-
overwrite =
|
8 |
|
9 |
df = pd.read_csv(chunk_file)
|
10 |
|
|
|
2 |
from buster.documents_manager import DeepLakeDocumentsManager
|
3 |
|
4 |
if __name__ == "__main__":
|
5 |
+
vector_store_path = "/Users/louis/Downloads/wiki_tai_langchain_hf_llm"
|
6 |
+
chunk_file = "./llm_course.csv"
|
7 |
+
overwrite = False
|
8 |
|
9 |
df = pd.read_csv(chunk_file)
|
10 |
|
@@ -50,13 +50,13 @@ def get_title_link_from_md_title(md_title: str, title_link_data: dict):
|
|
50 |
return data["title"], data["link"]
|
51 |
# default back to course link if not found...
|
52 |
print("\nNot found: ", md_title)
|
53 |
-
return md_title, "https://learn.activeloop.ai/courses/
|
54 |
|
55 |
|
56 |
if __name__ == "__main__":
|
57 |
folder_path = "/path/to/folder/with/md_content/"
|
58 |
-
folder_path = "/Users/
|
59 |
-
|
60 |
md_files = find_md_files(folder_path)
|
61 |
|
62 |
headers_to_split_on = [
|
@@ -74,14 +74,16 @@ if __name__ == "__main__":
|
|
74 |
|
75 |
from tqdm import tqdm
|
76 |
|
77 |
-
with open("title_link_langchaincourse.json", "r") as f:
|
|
|
|
|
|
|
78 |
title_link_data = json.load(f)
|
79 |
|
80 |
for md_file in tqdm(md_files):
|
81 |
md_title = md_file["title"]
|
82 |
md_raw_content = md_file["content"]
|
83 |
md_header_splits = markdown_splitter.split_text(md_raw_content)
|
84 |
-
|
85 |
title, link = get_title_link_from_md_title(
|
86 |
md_title, title_link_data=title_link_data
|
87 |
)
|
@@ -100,7 +102,7 @@ if __name__ == "__main__":
|
|
100 |
chunk = {
|
101 |
"title": title,
|
102 |
"content": headers + "\n" + substring,
|
103 |
-
"source": "
|
104 |
"url": link,
|
105 |
}
|
106 |
chunks.append(chunk)
|
@@ -110,4 +112,4 @@ if __name__ == "__main__":
|
|
110 |
df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)
|
111 |
|
112 |
print(f"Exported {len(df)} chunks from {len(md_files)} articles.")
|
113 |
-
df.to_csv("
|
|
|
50 |
return data["title"], data["link"]
|
51 |
# default back to course link if not found...
|
52 |
print("\nNot found: ", md_title)
|
53 |
+
return md_title, "https://learn.activeloop.ai/courses/llms/"
|
54 |
|
55 |
|
56 |
if __name__ == "__main__":
|
57 |
folder_path = "/path/to/folder/with/md_content/"
|
58 |
+
folder_path = "/Users/louis/Downloads/llm_course"
|
59 |
+
# folder_path = "/Users/louis/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024/LangChain & Vector Databases in Production 92657e0d65da4201bfdd6db915a4eb9f"
|
60 |
md_files = find_md_files(folder_path)
|
61 |
|
62 |
headers_to_split_on = [
|
|
|
74 |
|
75 |
from tqdm import tqdm
|
76 |
|
77 |
+
# with open("data/title_link_langchaincourse.json", "r") as f:
|
78 |
+
# title_link_data = json.load(f)
|
79 |
+
|
80 |
+
with open("/Users/louis/Downloads/output2.json", "r") as f:
|
81 |
title_link_data = json.load(f)
|
82 |
|
83 |
for md_file in tqdm(md_files):
|
84 |
md_title = md_file["title"]
|
85 |
md_raw_content = md_file["content"]
|
86 |
md_header_splits = markdown_splitter.split_text(md_raw_content)
|
|
|
87 |
title, link = get_title_link_from_md_title(
|
88 |
md_title, title_link_data=title_link_data
|
89 |
)
|
|
|
102 |
chunk = {
|
103 |
"title": title,
|
104 |
"content": headers + "\n" + substring,
|
105 |
+
"source": "llm_course",
|
106 |
"url": link,
|
107 |
}
|
108 |
chunks.append(chunk)
|
|
|
112 |
df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)
|
113 |
|
114 |
print(f"Exported {len(df)} chunks from {len(md_files)} articles.")
|
115 |
+
df.to_csv("llm_course.csv")
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import time
|
3 |
+
import os
|
4 |
+
from buster.documents_manager import DeepLakeDocumentsManager
|
5 |
+
from deeplake.core.vectorstore import VectorStore
|
6 |
+
|
7 |
+
# Upload every chunk CSV into a single Deep Lake vector store.
#
# The destination dataset is "hub://<DEEPLAKE_ORG>/<DEEPLAKE_DATASET>"; both
# parts can be overridden through environment variables of the same name.
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "dev_vector_store")
DEEPLAKE_ORG = os.getenv("DEEPLAKE_ORG", "towards_ai")

# Chunk files to upload, listed in upload order.
CSV_PATHS = [
    "./data/llm_course.csv",
    "./data/hf_transformers.csv",
    "./data/langchain_course.csv",
    "./data/filtered_tai_v2.csv",
    "./data/wiki.csv",  # , encoding="ISO-8859-1")
]

# Read all CSVs before the documents manager is created, so a missing or
# unreadable file aborts the script before the store is opened with
# overwrite=True.
dataframes = [pd.read_csv(csv_path) for csv_path in CSV_PATHS]

dataset_path = f"hub://{DEEPLAKE_ORG}/{DEEPLAKE_DATASET}"

dm = DeepLakeDocumentsManager(
    vector_store_path=dataset_path,
    overwrite=True,  # NOTE(review): presumably replaces any existing dataset at this path - confirm
    required_columns=["url", "content", "source", "title"],
)

# Every source is added with the same batching settings, so one loop replaces
# the previously copy-pasted calls.
for df in dataframes:
    dm.batch_add(
        df=df,
        batch_size=3000,
        min_time_interval=60,
        num_workers=32,
        csv_filename="embeddings.csv",
        csv_overwrite=False,
    )
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
# Folder whose entries should have their trailing " <suffix>" stripped.
folder_path = "/Users/louis/Downloads/llm_course"

# Take the directory listing once, up front; entries are renamed inside the loop.
filenames = os.listdir(folder_path)

for filename in filenames:
    # Split at the LAST space: `kept` is everything before it.
    kept, space, _discarded = filename.rpartition(" ")

    # Names without any space are left untouched.
    if not space:
        continue

    # NOTE(review): everything after the last space is dropped, which includes
    # a file extension when it follows that space - confirm this is intended.
    new_filename = kept
    source = os.path.join(folder_path, filename)
    destination = os.path.join(folder_path, new_filename)

    os.rename(source, destination)
    print(f"Renamed {filename} to {new_filename}")
|