agshiv92 commited on
Commit
10e8311
·
verified ·
1 Parent(s): 610ad42

Upload 3 files

Browse files
task6_model_deployment/scripts/query_engine.py CHANGED
@@ -1,105 +1,143 @@
1
- import os
2
- import yaml
3
- from dotenv import load_dotenv
4
- from pinecone import Pinecone
5
- from llama_index.vector_stores.pinecone import PineconeVectorStore
6
- from llama_index.core import VectorStoreIndex
7
- from llama_index.core.response.pprint_utils import pprint_source_node
8
- from llama_index.core import Settings
9
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
10
- from llama_index.llms.groq import Groq
11
-
12
- # Load environment variables from the .env file
13
- load_dotenv()
14
-
15
- # Function to load YAML configuration
16
- def load_config(config_path):
17
- with open(config_path, 'r') as file:
18
- config = yaml.safe_load(file)
19
- return config
20
-
21
- # Pinecone Index Connection
22
- def index_connection(config_path):
23
- """
24
- Initializes the Pinecone client and retrieves the index using the provided YAML configuration.
25
-
26
- Args:
27
- config_path (str): Path to the YAML configuration file.
28
-
29
- Returns:
30
- index: The initialized Pinecone index.
31
- """
32
- # Load the configuration from a YAML file
33
- config = load_config(config_path)
34
- embed_model_name = config['embeddings']['model_name']
35
- embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
36
- model_name = config['model']['model_name']
37
- Settings.llm = Groq(model=model_name, api_key=os.getenv('GROQ_API_KEY'))
38
- Settings.embed_model = embed_model
39
- # Initialize the Pinecone client
40
- pc = Pinecone(
41
- api_key=os.getenv('PINECONE_API_KEY') # Get the Pinecone API key from the environment
42
- )
43
- index_name = config['pinecone']['index_name']
44
- index = pc.Index(index_name) # Get the Pinecone index using the index name from the config
45
- return index
46
-
47
- # Initialize Pinecone Vector Store and Retriever
48
- def initialize_retriever(pinecone_index):
49
- """
50
- Initializes the Pinecone vector store and sets up the retriever.
51
-
52
- Args:
53
- pinecone_index: The Pinecone index object.
54
-
55
- Returns:
56
- retriever: The initialized retriever for querying the vector store.
57
- """
58
-
59
- # Initialize Pinecone Vector Store
60
- vector_store = PineconeVectorStore(pinecone_index=pinecone_index, text_key="_node_content")
61
-
62
- # Create the retriever using the VectorStoreIndex and configure similarity_top_k
63
- index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
64
-
65
- return index
66
-
67
- # Query the Pinecone Index
68
- def index_retrieval(index, query_text):
69
- """
70
- Queries the Pinecone index using the provided retriever and query text.
71
-
72
- Args:
73
- retriever: The initialized retriever.
74
- query_text (str): The text query to search for.
75
-
76
- Returns:
77
- str: Query result from the Pinecone index.
78
- """
79
-
80
- # Execute the query using the retriever
81
- query_engine = index.as_query_engine()
82
- response = query_engine.query(query_text)
83
-
84
- print(response)# Pretty print the source node for clarity
85
-
86
- return response
87
-
88
- # Example usage
89
- if __name__ == "__main__":
90
- # Dynamically determine the path to the config file
91
- script_dir = os.path.dirname(os.path.abspath(__file__)) # Get the current script directory
92
- base_dir = os.path.dirname(script_dir) # Go one level up
93
- config_path = os.path.join(base_dir, 'configs', 'config.yaml') # Path to 'config.yaml' in the 'configs' directory
94
-
95
- # Step 1: Initialize Pinecone Connection
96
- pinecone_index = index_connection(config_path=config_path)
97
-
98
- # Step 2: Initialize the Retriever
99
- retriever = initialize_retriever(pinecone_index)
100
-
101
- # Step 3: Query the Pinecone index
102
- query_text = """How much can the Minister of Health pay out of the Consolidated Revenue Fund in relation to coronavirus disease 2019 (COVID-19) tests"""
103
- response = index_retrieval(retriever, query_text)
104
-
105
- # Print the result (already printed by pprint_source_node)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ from dotenv import load_dotenv
4
+ from pinecone import Pinecone
5
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
6
+ from llama_index.core import VectorStoreIndex
7
+ from llama_index.core.response.pprint_utils import pprint_source_node
8
+ from llama_index.core import Settings
9
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
10
+ from llama_index.llms.groq import Groq
11
+ from llama_index.core.tools import QueryEngineTool
12
+ from llama_index.core.query_engine import RouterQueryEngine
13
+ from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector
14
+ from llama_index.core.selectors import (
15
+ PydanticMultiSelector,
16
+ PydanticSingleSelector,
17
+ )
18
+ from llama_index.core import PromptTemplate
19
+ from llama_index.core.response_synthesizers import TreeSummarize
20
+ import nest_asyncio
21
+ import asyncio
22
# Patch the running event loop to allow nesting — LlamaIndex's router/query
# machinery runs async code internally and may be invoked from an active loop.
nest_asyncio.apply()
# Load environment variables from the .env file
# (expects GROQ_API_KEY and PINECONE_API_KEY to be defined there or in the env)
load_dotenv()
25
+
26
# Function to load YAML configuration
def load_config(config_path):
    """Parse the YAML file at *config_path* and return its contents as a dict."""
    with open(config_path, 'r') as config_file:
        return yaml.safe_load(config_file)
31
def load_prompt_template(prompt_template_path):
    """Parse the prompt-template YAML file and return its contents as a dict."""
    with open(prompt_template_path, 'r') as template_file:
        return yaml.safe_load(template_file)
35
# Pinecone Index Connection
def index_connection(config_path):
    """
    Configures global LlamaIndex settings (Groq LLM + HuggingFace embeddings)
    and connects to the two Pinecone indexes named in the YAML configuration.

    Args:
        config_path (str): Path to the YAML configuration file. Must provide
            embeddings.model_name, model.model_name, pinecone.index_name and
            pinecone.summary_index_name.

    Returns:
        tuple: (index, summary_index) — the document-chunk Pinecone index and
        the summary Pinecone index, in that order.
    """
    # Load the configuration from a YAML file
    config = load_config(config_path)
    embed_model_name = config['embeddings']['model_name']
    embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
    model_name = config['model']['model_name']
    # NOTE: these two assignments mutate process-wide LlamaIndex Settings as a
    # side effect; every later index/query in this process uses them.
    Settings.llm = Groq(model=model_name, api_key=os.getenv('GROQ_API_KEY'))
    Settings.embed_model = embed_model
    # Initialize the Pinecone client
    pc = Pinecone(
        api_key=os.getenv('PINECONE_API_KEY')  # Get the Pinecone API key from the environment
    )
    index_name = config['pinecone']['index_name']
    summary_index_name = config['pinecone']['summary_index_name']
    index = pc.Index(index_name)
    summary_index = pc.Index(summary_index_name)  # Get the Pinecone index using the index name from the config
    return index,summary_index
62
+
63
# Initialize Pinecone Vector Store and Retriever
def initialize_retriever(pinecone_index,summary_index):
    """
    Wraps both Pinecone indexes in LlamaIndex vector stores and builds a
    queryable VectorStoreIndex over each.

    Args:
        pinecone_index: Pinecone index holding the document chunks.
        summary_index: Pinecone index holding the document summaries.

    Returns:
        tuple: (index, summary_index) — VectorStoreIndex objects over the
        chunk store and the summary store respectively (not bare retrievers).
    """
    # Initialize Pinecone Vector Store
    # NOTE(review): text_key="_node_content" assumes node text was stored
    # under that metadata key at ingestion time — confirm against the loader.
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index, text_key="_node_content")
    summary_vector_store = PineconeVectorStore(pinecone_index=summary_index, text_key="_node_content")
    # Build indexes on top of the already-populated vector stores (no re-embedding).
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    summary_index = VectorStoreIndex.from_vector_store(vector_store=summary_vector_store)
    return index,summary_index
82
+
83
# Query the Pinecone Index
def index_retrieval(index, summary_index, query_text):
    """
    Routes a query between the chunk-level and summary-level indexes via a
    RouterQueryEngine and returns the synthesized answer.

    Args:
        index: VectorStoreIndex over the document-chunk Pinecone index.
        summary_index: VectorStoreIndex over the summary Pinecone index.
        query_text (str): The text query to answer.

    Returns:
        The response object produced by the router query engine.
    """
    # Resolve the prompt-template YAML relative to this script's parent directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(script_dir)
    prompt_template_path = os.path.join(base_dir, 'model', 'prompt_template.yaml')
    prompt_template = load_prompt_template(prompt_template_path)
    QA_PROMPT = PromptTemplate(prompt_template['QA_PROMPT_TMPL'])

    # One query engine per index, both using the same QA prompt.
    vector_query_engine = index.as_query_engine(text_qa_template=QA_PROMPT)
    summary_query_engine = summary_index.as_query_engine(text_qa_template=QA_PROMPT)

    # FIX: both tools previously carried the identical description
    # "Useful for answering questions about this context", which gave the LLM
    # selector no signal to route between them. Distinct descriptions are what
    # the router's selector conditions on.
    vector_tool = QueryEngineTool.from_defaults(
        query_engine=vector_query_engine,
        description="Useful for answering specific, detail-oriented questions from document chunks",
    )

    summary_tool = QueryEngineTool.from_defaults(
        query_engine=summary_query_engine,
        description="Useful for high-level summary questions over entire documents",
    )

    # TreeSummarize merges answers when the multi-selector picks both tools.
    tree_summarize = TreeSummarize(
        summary_template=PromptTemplate(prompt_template['TREE_SUMMARIZE_PROMPT_TMPL'])
    )
    query_engine = RouterQueryEngine(
        selector=LLMMultiSelector.from_defaults(),
        query_engine_tools=[
            vector_tool,
            summary_tool,
        ],
        summarizer=tree_summarize,
    )
    response = query_engine.query(query_text)
    return response
126
# Example usage
if __name__ == "__main__":
    # Locate configs/config.yaml one directory above this script.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    project_dir = os.path.dirname(this_dir)
    config_path = os.path.join(project_dir, 'configs', 'config.yaml')

    # Step 1: connect to both Pinecone indexes (also sets global LLM/embeddings).
    pinecone_index, summary_index = index_connection(config_path=config_path)

    # Step 2: build queryable indexes over the two vector stores.
    retriever, summary_retriever = initialize_retriever(pinecone_index, summary_index)

    # Step 3: run an example query through the router engine and show the answer.
    query_text = """How much can the Minister of Health pay out of the Consolidated Revenue Fund in relation to coronavirus disease 2019 (COVID-19) tests"""
    response = index_retrieval(retriever, summary_retriever, query_text)
    print(response)
task6_model_deployment/scripts/vector_database_creation.py CHANGED
@@ -1,50 +1,76 @@
1
- import yaml
2
- import os
3
- import os
4
- from pinecone import Pinecone, ServerlessSpec
5
- from dotenv import load_dotenv
6
-
7
- load_dotenv()
8
- script_dir = os.path.dirname(os.path.abspath(__file__))
9
-
10
- # Construct the base directory (one level up from the script directory)
11
- base_dir = os.path.dirname(script_dir)
12
-
13
- # Construct the path to the config file
14
- config_path = os.path.join(base_dir, 'configs', 'config.yaml')
15
-
16
- def load_config(file_path):
17
- with open(file_path, 'r') as file:
18
- config = yaml.safe_load(file)
19
- return config
20
-
21
- def creation_of_vector_database():
22
- # Load the configuration from a YAML file
23
- config = load_config(config_path)
24
-
25
- # Initialize the Pinecone client
26
- pc = Pinecone(
27
- api_key=os.getenv('PINECONE_API_KEY')) # Ensure your API key is set in the environment variables
28
-
29
- # Connect to the Pinecone index
30
- index_name = config['pinecone']['index_name']
31
- dimension = config['pinecone']['dimension']
32
- metric = config['pinecone']['metric']
33
- # file_path = config['file_location']['file_path']
34
- cloud = config['pinecone']['cloud']
35
- region = config['pinecone']['region']
36
-
37
- if index_name not in pc.list_indexes().names():
38
- pc.create_index(
39
- name=index_name,
40
- dimension=dimension,
41
- metric=metric,
42
- spec=ServerlessSpec(
43
- cloud=cloud, # Specify your preferred cloud provider
44
- region=region # Specify your preferred region
45
- )
46
- )
47
-
48
- if __name__ == "__main__":
49
- creation_of_vector_database()
50
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import os
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ from dotenv import load_dotenv
6
+
7
# Load PINECONE_API_KEY (and any other secrets) from the .env file.
load_dotenv()
# Directory containing this script.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the base directory (one level up from the script directory)
base_dir = os.path.dirname(script_dir)

# Construct the path to the config file
config_path = os.path.join(base_dir, 'configs', 'config.yaml')
15
+
16
def load_config(file_path):
    """Parse the YAML file at *file_path* and return its contents as a dict."""
    with open(file_path, 'r') as yaml_file:
        return yaml.safe_load(yaml_file)
20
+
21
def _create_index_if_missing(pc, config, index_name):
    """Create a serverless Pinecone index named *index_name* with the configured
    dimension/metric, if no index with that name exists yet (idempotent)."""
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=config['pinecone']['dimension'],
            metric=config['pinecone']['metric'],
            spec=ServerlessSpec(
                cloud=config['pinecone']['cloud'],    # Specify your preferred cloud provider
                region=config['pinecone']['region'],  # Specify your preferred region
            ),
        )

def creation_of_vector_database():
    """Create the main document-chunk Pinecone index if it does not exist.

    Previously this function and creation_of_summary_database() were ~40 lines
    of copy-pasted code differing only in the config key; the shared logic now
    lives in _create_index_if_missing().
    """
    # Load the configuration from a YAML file
    config = load_config(config_path)
    # Initialize the Pinecone client (API key must be set in the environment)
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    _create_index_if_missing(pc, config, config['pinecone']['index_name'])

def creation_of_summary_database():
    """Create the summary Pinecone index if it does not exist."""
    # Load the configuration from a YAML file
    config = load_config(config_path)
    # Initialize the Pinecone client (API key must be set in the environment)
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    _create_index_if_missing(pc, config, config['pinecone']['summary_index_name'])
73
if __name__ == "__main__":
    # Ensure both Pinecone indexes exist before any data loading runs.
    creation_of_vector_database()
    creation_of_summary_database()
76
+
task6_model_deployment/scripts/vector_database_loading.py CHANGED
@@ -1,91 +1,112 @@
1
- import yaml
2
- import os
3
- import os
4
- from pinecone import Pinecone, ServerlessSpec
5
- from dotenv import load_dotenv
6
- import os
7
- from dotenv import load_dotenv
8
- from llama_index.vector_stores.pinecone import PineconeVectorStore
9
- from llama_index.core import StorageContext, VectorStoreIndex
10
- from llama_index.core import SimpleDirectoryReader
11
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
- from llama_index.core import Settings
13
- from llama_index.llms.groq import Groq
14
-
15
-
16
- load_dotenv()
17
- script_dir = os.path.dirname(os.path.abspath(__file__))
18
-
19
- # Construct the base directory (one level up from the script directory)
20
- base_dir = os.path.dirname(script_dir)
21
-
22
- # Construct the path to the config file
23
- config_path = os.path.join(base_dir, 'configs', 'config.yaml')
24
-
25
- def load_config(file_path):
26
- with open(file_path, 'r') as file:
27
- config = yaml.safe_load(file)
28
- return config
29
-
30
- def index_connection():
31
- # Load the configuration from a YAML file
32
- config = load_config(config_path)
33
-
34
- # Initialize the Pinecone client
35
- pc = Pinecone(
36
- api_key=os.getenv('PINECONE_API_KEY')
37
- )
38
- index_name = config['pinecone']['index_name']
39
- index = pc.Index(index_name)
40
- return index
41
-
42
- def chunk_documents(directory_path="./data/paul_graham"):
43
- """
44
- Reads documents from a specified directory and chunks them.
45
-
46
- Args:
47
- directory_path (str): The path of the directory containing documents to read.
48
-
49
- Returns:
50
- List[Document]: A list of document chunks that will be indexed.
51
- """
52
- # Load documents from the directory
53
- documents = SimpleDirectoryReader(directory_path).load_data()
54
-
55
- # Here you could apply further chunking logic if needed (for example, split large documents into smaller chunks)
56
- # For now, we're assuming the reader does basic chunking for us
57
-
58
- return documents
59
-
60
- # Part 2: Loading Chunks into Pinecone
61
- def load_chunks_into_pinecone(documents):
62
- config = load_config(config_path)
63
- pinecone_index = index_connection()
64
- model_name = config['model']['model_name']
65
- embed_model_name = config['embeddings']['model_name']
66
- print(embed_model_name)
67
- Settings.llm = Groq(model=model_name, api_key=os.getenv('GROQ_API_KEY'))
68
- Settings.chunk_size = config['pinecone']['dimension']
69
- embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
70
- Settings.embed_model = embed_model
71
-
72
- vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
73
-
74
- # Create the storage context
75
- storage_context = StorageContext.from_defaults(vector_store=vector_store)
76
-
77
- # Create the index with the documents
78
- index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
79
-
80
- print("Data has been successfully loaded into the Pinecone index!")
81
-
82
- return index
83
-
84
- # Example usage
85
- if __name__ == "__main__":
86
-
87
- # Step 1: Chunk the documents
88
- documents = chunk_documents(directory_path=r"C:\Users\agshi\Desktop\Omdena\Canada Policy\TorontoCanadaChapter_CanPolicyInsight\task6_model_deployment\assets")
89
-
90
- # Step 2: Load the chunks into Pinecone
91
- index = load_chunks_into_pinecone(documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import os
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ from dotenv import load_dotenv
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
9
+ from llama_index.core import StorageContext, VectorStoreIndex
10
+ from llama_index.core import SimpleDirectoryReader
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from llama_index.core import Settings
13
+ from llama_index.llms.groq import Groq
14
+ from llama_index.core.node_parser import SentenceSplitter
15
+ from llama_index.core import DocumentSummaryIndex
16
+
17
# Load PINECONE_API_KEY / GROQ_API_KEY from the .env file.
load_dotenv()
# Directory containing this script.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the base directory (one level up from the script directory)
base_dir = os.path.dirname(script_dir)

# Construct the path to the config file
config_path = os.path.join(base_dir, 'configs', 'config.yaml')
25
+
26
def load_config(file_path):
    """Return the parsed contents of the YAML configuration at *file_path*."""
    with open(file_path, 'r') as fh:
        parsed = yaml.safe_load(fh)
    return parsed
30
+
31
def _connect_to_index(config_key):
    """Open a Pinecone client and return the index whose name is stored under
    config['pinecone'][config_key] in the YAML configuration."""
    # Load the configuration from a YAML file
    config = load_config(config_path)
    # Initialize the Pinecone client (API key from the environment)
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    return pc.Index(config['pinecone'][config_key])

def index_connection():
    """Return a handle to the main document-chunk Pinecone index.

    Previously duplicated summary_index_connection() line-for-line except for
    the config key; the shared logic now lives in _connect_to_index().
    """
    return _connect_to_index('index_name')

def summary_index_connection():
    """Return a handle to the summary Pinecone index."""
    return _connect_to_index('summary_index_name')
55
+
56
def chunk_documents(directory_path="./data/paul_graham"):
    """
    Reads documents from a specified directory and chunks them.

    Args:
        directory_path (str): The path of the directory containing documents to read.

    Returns:
        List[Document]: A list of document chunks that will be indexed.
    """
    # SimpleDirectoryReader performs the basic per-file loading/chunking here;
    # any finer-grained splitting is applied later at indexing time.
    loaded_docs = SimpleDirectoryReader(directory_path).load_data()
    return loaded_docs
73
+
74
# Part 2: Loading Chunks into Pinecone
def load_chunks_into_pinecone(documents):
    """
    Embeds *documents* into the main Pinecone index and builds a document
    summary index in the summary Pinecone index.

    Args:
        documents: List of LlamaIndex Document objects (from chunk_documents).

    Returns:
        tuple: (index, summary_index_from_documents) — the VectorStoreIndex
        over the chunk store and the DocumentSummaryIndex built here.
    """
    config = load_config(config_path)
    pinecone_index = index_connection()
    model_name = config['model']['model_name']
    embed_model_name = config['embeddings']['model_name']
    print(embed_model_name)
    # Global LlamaIndex settings: Groq LLM + HuggingFace embeddings.
    Settings.llm = Groq(model=model_name, api_key=os.getenv('GROQ_API_KEY'))
    # NOTE(review): this sets the text chunk size to the *embedding dimension*
    # from the config — those are unrelated quantities; looks like a bug.
    # Confirm whether a dedicated chunk_size config key was intended.
    Settings.chunk_size = config['pinecone']['dimension']
    embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
    Settings.embed_model = embed_model

    # add_sparse_vector stores sparse term vectors alongside dense embeddings
    # (hybrid retrieval support).
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index,add_sparse_vector=True)

    # Create the storage context
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Create the index with the documents (embeds and upserts into Pinecone).
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

    splitter = SentenceSplitter(chunk_size=1024)

    # Build the summary index into the second Pinecone index.
    summary_index = summary_index_connection()
    summary_vector_store = PineconeVectorStore(pinecone_index=summary_index)
    summary_storage_context = StorageContext.from_defaults(vector_store=summary_vector_store)
    summary_index_from_documents = DocumentSummaryIndex.from_documents(documents, transformations=[splitter], storage_context=summary_storage_context,show_progress=True)


    print("Data has been successfully loaded into the Pinecone index!")

    return index,summary_index_from_documents
105
# Example usage
if __name__ == "__main__":

    # Step 1: Chunk the documents
    # NOTE(review): hard-coded absolute Windows path — this only works on the
    # original author's machine; should come from config or a CLI argument.
    documents = chunk_documents(directory_path=r"C:\Users\agshi\Desktop\Omdena\Canada Policy\TorontoCanadaChapter_CanPolicyInsight\task6_model_deployment\assets")

    # Step 2: Load the chunks into Pinecone
    index,summary_index_from_documents = load_chunks_into_pinecone(documents)