Spaces:
Running
Running
# src/semantic_retriever.py | |
from typing import List, Dict, Any, Tuple, Optional | |
import numpy as np | |
from langchain_community.embeddings import OpenAIEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain.schema import Document | |
from src.ontology_manager import OntologyManager | |
class SemanticRetriever: | |
""" | |
Enhanced retrieval system that combines vector search with ontology awareness. | |
""" | |
def __init__( | |
self, | |
ontology_manager: OntologyManager, | |
embeddings_model = None, | |
text_chunks: Optional[List[str]] = None | |
): | |
""" | |
Initialize the semantic retriever. | |
Args: | |
ontology_manager: The ontology manager instance | |
embeddings_model: The embeddings model to use (defaults to OpenAIEmbeddings) | |
text_chunks: Optional list of text chunks to add to the vector store | |
""" | |
self.ontology_manager = ontology_manager | |
self.embeddings = embeddings_model or OpenAIEmbeddings() | |
# Create a vector store with the text representation of the ontology | |
ontology_text = ontology_manager.get_text_representation() | |
self.ontology_chunks = self._split_text(ontology_text) | |
# Add additional text chunks if provided | |
if text_chunks: | |
self.text_chunks = text_chunks | |
all_chunks = self.ontology_chunks + text_chunks | |
else: | |
self.text_chunks = [] | |
all_chunks = self.ontology_chunks | |
# Convert to Document objects for FAISS | |
documents = [Document(page_content=chunk, metadata={"source": "ontology" if i < len(self.ontology_chunks) else "text"}) | |
for i, chunk in enumerate(all_chunks)] | |
# Create the vector store | |
self.vector_store = FAISS.from_documents(documents, self.embeddings) | |
def _split_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]: | |
"""Split text into chunks for embedding.""" | |
chunks = [] | |
text_length = len(text) | |
for i in range(0, text_length, chunk_size - overlap): | |
chunk = text[i:i + chunk_size] | |
if len(chunk) < 50: # Skip very small chunks | |
continue | |
chunks.append(chunk) | |
return chunks | |
def retrieve(self, query: str, k: int = 4, include_ontology_context: bool = True) -> List[Document]: | |
""" | |
Retrieve relevant documents using a hybrid approach. | |
Args: | |
query: The query string | |
k: Number of documents to retrieve | |
include_ontology_context: Whether to include additional ontology context | |
Returns: | |
A list of retrieved documents | |
""" | |
# Get semantic context from the ontology | |
if include_ontology_context: | |
ontology_context = self.ontology_manager.get_semantic_context(query) | |
else: | |
ontology_context = [] | |
# Perform vector similarity search | |
vector_results = self.vector_store.similarity_search(query, k=k) | |
# Combine results | |
combined_results = vector_results | |
# Add ontology context as additional documents | |
for i, context in enumerate(ontology_context): | |
combined_results.append(Document( | |
page_content=context, | |
metadata={"source": "ontology_context", "context_id": i} | |
)) | |
return combined_results | |
def retrieve_with_paths(self, query: str, k: int = 4) -> Dict[str, Any]: | |
""" | |
Enhanced retrieval that includes semantic paths between entities. | |
Args: | |
query: The query string | |
k: Number of documents to retrieve | |
Returns: | |
A dictionary containing retrieved documents and semantic paths | |
""" | |
# Basic retrieval | |
basic_results = self.retrieve(query, k) | |
# Extract potential entities from the query (simplified approach) | |
# A more sophisticated approach would use NER or entity linking | |
entity_types = ["Product", "Department", "Employee", "Manager", "Customer", "Feedback"] | |
query_words = query.lower().split() | |
potential_entities = [] | |
for entity_type in entity_types: | |
if entity_type.lower() in query_words: | |
# Get instances of this type | |
instances = self.ontology_manager.get_instances_of_class(entity_type) | |
if instances: | |
# Just take the first few for demonstration | |
potential_entities.extend(instances[:2]) | |
# Find paths between potential entities | |
paths = [] | |
if len(potential_entities) >= 2: | |
for i in range(len(potential_entities)): | |
for j in range(i+1, len(potential_entities)): | |
source = potential_entities[i] | |
target = potential_entities[j] | |
# Find paths between these entities | |
entity_paths = self.ontology_manager.find_paths(source, target, max_length=3) | |
if entity_paths: | |
for path in entity_paths: | |
# Convert path to text | |
path_text = self._path_to_text(path) | |
paths.append({ | |
"source": source, | |
"target": target, | |
"path": path, | |
"text": path_text | |
}) | |
# Convert paths to documents | |
path_documents = [] | |
for i, path_info in enumerate(paths): | |
path_documents.append(Document( | |
page_content=path_info["text"], | |
metadata={ | |
"source": "semantic_path", | |
"path_id": i, | |
"source_entity": path_info["source"], | |
"target_entity": path_info["target"] | |
} | |
)) | |
return { | |
"documents": basic_results + path_documents, | |
"paths": paths | |
} | |
def _path_to_text(self, path: List[Dict]) -> str: | |
"""Convert a path to a text description.""" | |
if not path: | |
return "" | |
text_parts = [] | |
for edge in path: | |
source = edge["source"] | |
target = edge["target"] | |
relation = edge["type"] | |
# Get entity information | |
source_info = self.ontology_manager.get_entity_info(source) | |
target_info = self.ontology_manager.get_entity_info(target) | |
# Get names if available | |
source_name = source | |
if "properties" in source_info and "name" in source_info["properties"]: | |
source_name = source_info["properties"]["name"] | |
target_name = target | |
if "properties" in target_info and "name" in target_info["properties"]: | |
target_name = target_info["properties"]["name"] | |
# Describe the relationship | |
text_parts.append(f"{source_name} {relation} {target_name}") | |
return " -> ".join(text_parts) | |
def search_by_property(self, class_type: str, property_name: str, property_value: str) -> List[Document]: | |
""" | |
Search for instances of a class with a specific property value. | |
Args: | |
class_type: The class to search in | |
property_name: The property name to match | |
property_value: The property value to match | |
Returns: | |
A list of matched entities as documents | |
""" | |
instances = self.ontology_manager.get_instances_of_class(class_type) | |
results = [] | |
for instance_id in instances: | |
entity_info = self.ontology_manager.get_entity_info(instance_id) | |
if "properties" in entity_info: | |
properties = entity_info["properties"] | |
if property_name in properties: | |
# Simple string matching (could be enhanced with fuzzy matching) | |
if str(properties[property_name]).lower() == property_value.lower(): | |
# Convert to document | |
doc_content = f"Instance: {instance_id}\n" | |
doc_content += f"Type: {class_type}\n" | |
doc_content += "Properties:\n" | |
for prop_name, prop_value in properties.items(): | |
doc_content += f"- {prop_name}: {prop_value}\n" | |
results.append(Document( | |
page_content=doc_content, | |
metadata={ | |
"source": "property_search", | |
"instance_id": instance_id, | |
"class_type": class_type | |
} | |
)) | |
return results |