agshiv92 commited on
Commit
10e8311
·
verified ·
1 Parent(s): 610ad42

Upload 3 files

Browse files
task6_model_deployment/scripts/query_engine.py CHANGED
@@ -1,105 +1,143 @@
1
- import os
2
- import yaml
3
- from dotenv import load_dotenv
4
- from pinecone import Pinecone
5
- from llama_index.vector_stores.pinecone import PineconeVectorStore
6
- from llama_index.core import VectorStoreIndex
7
- from llama_index.core.response.pprint_utils import pprint_source_node
8
- from llama_index.core import Settings
9
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
10
- from llama_index.llms.groq import Groq
11
-
12
- # Load environment variables from the .env file
13
- load_dotenv()
14
-
15
- # Function to load YAML configuration
16
- def load_config(config_path):
17
- with open(config_path, 'r') as file:
18
- config = yaml.safe_load(file)
19
- return config
20
-
21
- # Pinecone Index Connection
22
- def index_connection(config_path):
23
- """
24
- Initializes the Pinecone client and retrieves the index using the provided YAML configuration.
25
-
26
- Args:
27
- config_path (str): Path to the YAML configuration file.
28
-
29
- Returns:
30
- index: The initialized Pinecone index.
31
- """
32
- # Load the configuration from a YAML file
33
- config = load_config(config_path)
34
- embed_model_name = config['embeddings']['model_name']
35
- embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
36
- model_name = config['model']['model_name']
37
- Settings.llm = Groq(model=model_name, api_key=os.getenv('GROQ_API_KEY'))
38
- Settings.embed_model = embed_model
39
- # Initialize the Pinecone client
40
- pc = Pinecone(
41
- api_key=os.getenv('PINECONE_API_KEY') # Get the Pinecone API key from the environment
42
- )
43
- index_name = config['pinecone']['index_name']
44
- index = pc.Index(index_name) # Get the Pinecone index using the index name from the config
45
- return index
46
-
47
- # Initialize Pinecone Vector Store and Retriever
48
- def initialize_retriever(pinecone_index):
49
- """
50
- Initializes the Pinecone vector store and sets up the retriever.
51
-
52
- Args:
53
- pinecone_index: The Pinecone index object.
54
-
55
- Returns:
56
- retriever: The initialized retriever for querying the vector store.
57
- """
58
-
59
- # Initialize Pinecone Vector Store
60
- vector_store = PineconeVectorStore(pinecone_index=pinecone_index, text_key="_node_content")
61
-
62
- # Create the retriever using the VectorStoreIndex and configure similarity_top_k
63
- index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
64
-
65
- return index
66
-
67
- # Query the Pinecone Index
68
- def index_retrieval(index, query_text):
69
- """
70
- Queries the Pinecone index using the provided retriever and query text.
71
-
72
- Args:
73
- retriever: The initialized retriever.
74
- query_text (str): The text query to search for.
75
-
76
- Returns:
77
- str: Query result from the Pinecone index.
78
- """
79
-
80
- # Execute the query using the retriever
81
- query_engine = index.as_query_engine()
82
- response = query_engine.query(query_text)
83
-
84
- print(response)# Pretty print the source node for clarity
85
-
86
- return response
87
-
88
- # Example usage
89
- if __name__ == "__main__":
90
- # Dynamically determine the path to the config file
91
- script_dir = os.path.dirname(os.path.abspath(__file__)) # Get the current script directory
92
- base_dir = os.path.dirname(script_dir) # Go one level up
93
- config_path = os.path.join(base_dir, 'configs', 'config.yaml') # Path to 'config.yaml' in the 'configs' directory
94
-
95
- # Step 1: Initialize Pinecone Connection
96
- pinecone_index = index_connection(config_path=config_path)
97
-
98
- # Step 2: Initialize the Retriever
99
- retriever = initialize_retriever(pinecone_index)
100
-
101
- # Step 3: Query the Pinecone index
102
- query_text = """How much can the Minister of Health pay out of the Consolidated Revenue Fund in relation to coronavirus disease 2019 (COVID-19) tests"""
103
- response = index_retrieval(retriever, query_text)
104
-
105
- # Print the result (already printed by pprint_source_node)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ from dotenv import load_dotenv
4
+ from pinecone import Pinecone
5
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
6
+ from llama_index.core import VectorStoreIndex
7
+ from llama_index.core.response.pprint_utils import pprint_source_node
8
+ from llama_index.core import Settings
9
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
10
+ from llama_index.llms.groq import Groq
11
+ from llama_index.core.tools import QueryEngineTool
12
+ from llama_index.core.query_engine import RouterQueryEngine
13
+ from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector
14
+ from llama_index.core.selectors import (
15
+ PydanticMultiSelector,
16
+ PydanticSingleSelector,
17
+ )
18
+ from llama_index.core import PromptTemplate
19
+ from llama_index.core.response_synthesizers import TreeSummarize
20
+ import nest_asyncio
21
+ import asyncio
22
# Patch the running event loop to allow nesting — LlamaIndex's router/query
# machinery runs async code internally and may be invoked from an active loop.
nest_asyncio.apply()
# Load environment variables from the .env file
# (expects GROQ_API_KEY and PINECONE_API_KEY to be defined there or in the env)
load_dotenv()
25
+
26
# Function to load YAML configuration
def load_config(config_path):
    """Parse the YAML file at *config_path* and return its contents as a dict."""
    with open(config_path, 'r') as config_file:
        return yaml.safe_load(config_file)
31
def load_prompt_template(prompt_template_path):
    """Parse the prompt-template YAML file and return its contents as a dict."""
    with open(prompt_template_path, 'r') as template_file:
        return yaml.safe_load(template_file)
35
# Pinecone Index Connection
def index_connection(config_path):
    """
    Configures global LlamaIndex settings (Groq LLM + HuggingFace embeddings)
    and connects to the two Pinecone indexes named in the YAML configuration.

    Args:
        config_path (str): Path to the YAML configuration file. Must provide
            embeddings.model_name, model.model_name, pinecone.index_name and
            pinecone.summary_index_name.

    Returns:
        tuple: (index, summary_index) — the document-chunk Pinecone index and
        the summary Pinecone index, in that order.
    """
    # Load the configuration from a YAML file
    config = load_config(config_path)
    embed_model_name = config['embeddings']['model_name']
    embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
    model_name = config['model']['model_name']
    # NOTE: these two assignments mutate process-wide LlamaIndex Settings as a
    # side effect; every later index/query in this process uses them.
    Settings.llm = Groq(model=model_name, api_key=os.getenv('GROQ_API_KEY'))
    Settings.embed_model = embed_model
    # Initialize the Pinecone client
    pc = Pinecone(
        api_key=os.getenv('PINECONE_API_KEY')  # Get the Pinecone API key from the environment
    )
    index_name = config['pinecone']['index_name']
    summary_index_name = config['pinecone']['summary_index_name']
    index = pc.Index(index_name)
    summary_index = pc.Index(summary_index_name)  # Get the Pinecone index using the index name from the config
    return index,summary_index
62
+
63
# Initialize Pinecone Vector Store and Retriever
def initialize_retriever(pinecone_index,summary_index):
    """
    Wraps both Pinecone indexes in LlamaIndex vector stores and builds a
    queryable VectorStoreIndex over each.

    Args:
        pinecone_index: Pinecone index holding the document chunks.
        summary_index: Pinecone index holding the document summaries.

    Returns:
        tuple: (index, summary_index) — VectorStoreIndex objects over the
        chunk store and the summary store respectively (not bare retrievers).
    """
    # Initialize Pinecone Vector Store
    # NOTE(review): text_key="_node_content" assumes node text was stored
    # under that metadata key at ingestion time — confirm against the loader.
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index, text_key="_node_content")
    summary_vector_store = PineconeVectorStore(pinecone_index=summary_index, text_key="_node_content")
    # Build indexes on top of the already-populated vector stores (no re-embedding).
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    summary_index = VectorStoreIndex.from_vector_store(vector_store=summary_vector_store)
    return index,summary_index
82
+
83
# Query the Pinecone Index
def index_retrieval(index, summary_index, query_text):
    """
    Routes a query between the chunk-level and summary-level indexes via a
    RouterQueryEngine and returns the synthesized answer.

    Args:
        index: VectorStoreIndex over the document-chunk Pinecone index.
        summary_index: VectorStoreIndex over the summary Pinecone index.
        query_text (str): The text query to answer.

    Returns:
        The response object produced by the router query engine.
    """
    # Resolve the prompt-template YAML relative to this script's parent directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.dirname(script_dir)
    prompt_template_path = os.path.join(base_dir, 'model', 'prompt_template.yaml')
    prompt_template = load_prompt_template(prompt_template_path)
    QA_PROMPT = PromptTemplate(prompt_template['QA_PROMPT_TMPL'])

    # One query engine per index, both using the same QA prompt.
    vector_query_engine = index.as_query_engine(text_qa_template=QA_PROMPT)
    summary_query_engine = summary_index.as_query_engine(text_qa_template=QA_PROMPT)

    # FIX: both tools previously carried the identical description
    # "Useful for answering questions about this context", which gave the LLM
    # selector no signal to route between them. Distinct descriptions are what
    # the router's selector conditions on.
    vector_tool = QueryEngineTool.from_defaults(
        query_engine=vector_query_engine,
        description="Useful for answering specific, detail-oriented questions from document chunks",
    )

    summary_tool = QueryEngineTool.from_defaults(
        query_engine=summary_query_engine,
        description="Useful for high-level summary questions over entire documents",
    )

    # TreeSummarize merges answers when the multi-selector picks both tools.
    tree_summarize = TreeSummarize(
        summary_template=PromptTemplate(prompt_template['TREE_SUMMARIZE_PROMPT_TMPL'])
    )
    query_engine = RouterQueryEngine(
        selector=LLMMultiSelector.from_defaults(),
        query_engine_tools=[
            vector_tool,
            summary_tool,
        ],
        summarizer=tree_summarize,
    )
    response = query_engine.query(query_text)
    return response
126
# Example usage
if __name__ == "__main__":
    # Locate configs/config.yaml one directory above this script.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    project_dir = os.path.dirname(this_dir)
    config_path = os.path.join(project_dir, 'configs', 'config.yaml')

    # Step 1: connect to both Pinecone indexes (also sets global LLM/embeddings).
    pinecone_index, summary_index = index_connection(config_path=config_path)

    # Step 2: build queryable indexes over the two vector stores.
    retriever, summary_retriever = initialize_retriever(pinecone_index, summary_index)

    # Step 3: run an example query through the router engine and show the answer.
    query_text = """How much can the Minister of Health pay out of the Consolidated Revenue Fund in relation to coronavirus disease 2019 (COVID-19) tests"""
    response = index_retrieval(retriever, summary_retriever, query_text)
    print(response)
task6_model_deployment/scripts/vector_database_creation.py CHANGED
@@ -1,50 +1,76 @@
1
- import yaml
2
- import os
3
- import os
4
- from pinecone import Pinecone, ServerlessSpec
5
- from dotenv import load_dotenv
6
-
7
- load_dotenv()
8
- script_dir = os.path.dirname(os.path.abspath(__file__))
9
-
10
- # Construct the base directory (one level up from the script directory)
11
- base_dir = os.path.dirname(script_dir)
12
-
13
- # Construct the path to the config file
14
- config_path = os.path.join(base_dir, 'configs', 'config.yaml')
15
-
16
- def load_config(file_path):
17
- with open(file_path, 'r') as file:
18
- config = yaml.safe_load(file)
19
- return config
20
-
21
- def creation_of_vector_database():
22
- # Load the configuration from a YAML file
23
- config = load_config(config_path)
24
-
25
- # Initialize the Pinecone client
26
- pc = Pinecone(
27
- api_key=os.getenv('PINECONE_API_KEY')) # Ensure your API key is set in the environment variables
28
-
29
- # Connect to the Pinecone index
30
- index_name = config['pinecone']['index_name']
31
- dimension = config['pinecone']['dimension']
32
- metric = config['pinecone']['metric']
33
- # file_path = config['file_location']['file_path']
34
- cloud = config['pinecone']['cloud']
35
- region = config['pinecone']['region']
36
-
37
- if index_name not in pc.list_indexes().names():
38
- pc.create_index(
39
- name=index_name,
40
- dimension=dimension,
41
- metric=metric,
42
- spec=ServerlessSpec(
43
- cloud=cloud, # Specify your preferred cloud provider
44
- region=region # Specify your preferred region
45
- )
46
- )
47
-
48
- if __name__ == "__main__":
49
- creation_of_vector_database()
50
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import os
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ from dotenv import load_dotenv
6
+
7
# Load PINECONE_API_KEY (and any other secrets) from the .env file.
load_dotenv()
# Directory containing this script.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the base directory (one level up from the script directory)
base_dir = os.path.dirname(script_dir)

# Construct the path to the config file
config_path = os.path.join(base_dir, 'configs', 'config.yaml')
15
+
16
def load_config(file_path):
    """Parse the YAML file at *file_path* and return its contents as a dict."""
    with open(file_path, 'r') as yaml_file:
        return yaml.safe_load(yaml_file)
20
+
21
def _create_index_if_missing(pc, config, index_name):
    """Create a serverless Pinecone index named *index_name* with the configured
    dimension/metric, if no index with that name exists yet (idempotent)."""
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=config['pinecone']['dimension'],
            metric=config['pinecone']['metric'],
            spec=ServerlessSpec(
                cloud=config['pinecone']['cloud'],    # Specify your preferred cloud provider
                region=config['pinecone']['region'],  # Specify your preferred region
            ),
        )

def creation_of_vector_database():
    """Create the main document-chunk Pinecone index if it does not exist.

    Previously this function and creation_of_summary_database() were ~40 lines
    of copy-pasted code differing only in the config key; the shared logic now
    lives in _create_index_if_missing().
    """
    # Load the configuration from a YAML file
    config = load_config(config_path)
    # Initialize the Pinecone client (API key must be set in the environment)
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    _create_index_if_missing(pc, config, config['pinecone']['index_name'])

def creation_of_summary_database():
    """Create the summary Pinecone index if it does not exist."""
    # Load the configuration from a YAML file
    config = load_config(config_path)
    # Initialize the Pinecone client (API key must be set in the environment)
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    _create_index_if_missing(pc, config, config['pinecone']['summary_index_name'])
73
if __name__ == "__main__":
    # Ensure both Pinecone indexes exist before any data loading runs.
    creation_of_vector_database()
    creation_of_summary_database()
76
+
task6_model_deployment/scripts/vector_database_loading.py CHANGED
@@ -1,91 +1,112 @@
1
- import yaml
2
- import os
3
- import os
4
- from pinecone import Pinecone, ServerlessSpec
5
- from dotenv import load_dotenv
6
- import os
7
- from dotenv import load_dotenv
8
- from llama_index.vector_stores.pinecone import PineconeVectorStore
9
- from llama_index.core import StorageContext, VectorStoreIndex
10
- from llama_index.core import SimpleDirectoryReader
11
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
- from llama_index.core import Settings
13
- from llama_index.llms.groq import Groq
14
-
15
-
16
- load_dotenv()
17
- script_dir = os.path.dirname(os.path.abspath(__file__))
18
-
19
- # Construct the base directory (one level up from the script directory)
20
- base_dir = os.path.dirname(script_dir)
21
-
22
- # Construct the path to the config file
23
- config_path = os.path.join(base_dir, 'configs', 'config.yaml')
24
-
25
- def load_config(file_path):
26
- with open(file_path, 'r') as file:
27
- config = yaml.safe_load(file)
28
- return config
29
-
30
- def index_connection():
31
- # Load the configuration from a YAML file
32
- config = load_config(config_path)
33
-
34
- # Initialize the Pinecone client
35
- pc = Pinecone(
36
- api_key=os.getenv('PINECONE_API_KEY')
37
- )
38
- index_name = config['pinecone']['index_name']
39
- index = pc.Index(index_name)
40
- return index
41
-
42
- def chunk_documents(directory_path="./data/paul_graham"):
43
- """
44
- Reads documents from a specified directory and chunks them.
45
-
46
- Args:
47
- directory_path (str): The path of the directory containing documents to read.
48
-
49
- Returns:
50
- List[Document]: A list of document chunks that will be indexed.
51
- """
52
- # Load documents from the directory
53
- documents = SimpleDirectoryReader(directory_path).load_data()
54
-
55
- # Here you could apply further chunking logic if needed (for example, split large documents into smaller chunks)
56
- # For now, we're assuming the reader does basic chunking for us
57
-
58
- return documents
59
-
60
- # Part 2: Loading Chunks into Pinecone
61
- def load_chunks_into_pinecone(documents):
62
- config = load_config(config_path)
63
- pinecone_index = index_connection()
64
- model_name = config['model']['model_name']
65
- embed_model_name = config['embeddings']['model_name']
66
- print(embed_model_name)
67
- Settings.llm = Groq(model=model_name, api_key=os.getenv('GROQ_API_KEY'))
68
- Settings.chunk_size = config['pinecone']['dimension']
69
- embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
70
- Settings.embed_model = embed_model
71
-
72
- vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
73
-
74
- # Create the storage context
75
- storage_context = StorageContext.from_defaults(vector_store=vector_store)
76
-
77
- # Create the index with the documents
78
- index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
79
-
80
- print("Data has been successfully loaded into the Pinecone index!")
81
-
82
- return index
83
-
84
- # Example usage
85
- if __name__ == "__main__":
86
-
87
- # Step 1: Chunk the documents
88
- documents = chunk_documents(directory_path=r"C:\Users\agshi\Desktop\Omdena\Canada Policy\TorontoCanadaChapter_CanPolicyInsight\task6_model_deployment\assets")
89
-
90
- # Step 2: Load the chunks into Pinecone
91
- index = load_chunks_into_pinecone(documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import os
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ from dotenv import load_dotenv
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
9
+ from llama_index.core import StorageContext, VectorStoreIndex
10
+ from llama_index.core import SimpleDirectoryReader
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from llama_index.core import Settings
13
+ from llama_index.llms.groq import Groq
14
+ from llama_index.core.node_parser import SentenceSplitter
15
+ from llama_index.core import DocumentSummaryIndex
16
+
17
# Load PINECONE_API_KEY / GROQ_API_KEY from the .env file.
load_dotenv()
# Directory containing this script.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Construct the base directory (one level up from the script directory)
base_dir = os.path.dirname(script_dir)

# Construct the path to the config file
config_path = os.path.join(base_dir, 'configs', 'config.yaml')
25
+
26
def load_config(file_path):
    """Return the parsed contents of the YAML configuration at *file_path*."""
    with open(file_path, 'r') as fh:
        parsed = yaml.safe_load(fh)
    return parsed
30
+
31
def _connect_to_index(config_key):
    """Open a Pinecone client and return the index whose name is stored under
    config['pinecone'][config_key] in the YAML configuration."""
    # Load the configuration from a YAML file
    config = load_config(config_path)
    # Initialize the Pinecone client (API key from the environment)
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    return pc.Index(config['pinecone'][config_key])

def index_connection():
    """Return a handle to the main document-chunk Pinecone index.

    Previously duplicated summary_index_connection() line-for-line except for
    the config key; the shared logic now lives in _connect_to_index().
    """
    return _connect_to_index('index_name')

def summary_index_connection():
    """Return a handle to the summary Pinecone index."""
    return _connect_to_index('summary_index_name')
55
+
56
def chunk_documents(directory_path="./data/paul_graham"):
    """
    Reads documents from a specified directory and chunks them.

    Args:
        directory_path (str): The path of the directory containing documents to read.

    Returns:
        List[Document]: A list of document chunks that will be indexed.
    """
    # SimpleDirectoryReader performs the basic per-file loading/chunking here;
    # any finer-grained splitting is applied later at indexing time.
    loaded_docs = SimpleDirectoryReader(directory_path).load_data()
    return loaded_docs
73
+
74
# Part 2: Loading Chunks into Pinecone
def load_chunks_into_pinecone(documents):
    """
    Embeds *documents* into the main Pinecone index and builds a document
    summary index in the summary Pinecone index.

    Args:
        documents: List of LlamaIndex Document objects (from chunk_documents).

    Returns:
        tuple: (index, summary_index_from_documents) — the VectorStoreIndex
        over the chunk store and the DocumentSummaryIndex built here.
    """
    config = load_config(config_path)
    pinecone_index = index_connection()
    model_name = config['model']['model_name']
    embed_model_name = config['embeddings']['model_name']
    print(embed_model_name)
    # Global LlamaIndex settings: Groq LLM + HuggingFace embeddings.
    Settings.llm = Groq(model=model_name, api_key=os.getenv('GROQ_API_KEY'))
    # NOTE(review): this sets the text chunk size to the *embedding dimension*
    # from the config — those are unrelated quantities; looks like a bug.
    # Confirm whether a dedicated chunk_size config key was intended.
    Settings.chunk_size = config['pinecone']['dimension']
    embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
    Settings.embed_model = embed_model

    # add_sparse_vector stores sparse term vectors alongside dense embeddings
    # (hybrid retrieval support).
    vector_store = PineconeVectorStore(pinecone_index=pinecone_index,add_sparse_vector=True)

    # Create the storage context
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Create the index with the documents (embeds and upserts into Pinecone).
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

    splitter = SentenceSplitter(chunk_size=1024)

    # Build the summary index into the second Pinecone index.
    summary_index = summary_index_connection()
    summary_vector_store = PineconeVectorStore(pinecone_index=summary_index)
    summary_storage_context = StorageContext.from_defaults(vector_store=summary_vector_store)
    summary_index_from_documents = DocumentSummaryIndex.from_documents(documents, transformations=[splitter], storage_context=summary_storage_context,show_progress=True)


    print("Data has been successfully loaded into the Pinecone index!")

    return index,summary_index_from_documents
105
# Example usage
if __name__ == "__main__":

    # Step 1: Chunk the documents
    # NOTE(review): hard-coded absolute Windows path — this only works on the
    # original author's machine; should come from config or a CLI argument.
    documents = chunk_documents(directory_path=r"C:\Users\agshi\Desktop\Omdena\Canada Policy\TorontoCanadaChapter_CanPolicyInsight\task6_model_deployment\assets")

    # Step 2: Load the chunks into Pinecone
    index,summary_index_from_documents = load_chunks_into_pinecone(documents)