# ingest.py – works with LangChain v0.2+
"""Build FAISS vector stores from English and Czech document folders."""
import os
from pathlib import Path
from typing import List

from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

try:
    from langchain_openai import OpenAIEmbeddings  # optional
except ImportError:  # only needed when use_openai_embeddings=True
    OpenAIEmbeddings = None


class Ingest:
    """Load documents, split them into chunks, embed them and save FAISS stores.

    All constructor arguments are keyword-only.

    Args:
        english_embedding_model: HuggingFace model name used for the English
            store when OpenAI embeddings are not requested.
        czech_embedding_model: HuggingFace model name used for the Czech store.
        use_openai_embeddings: If true, the English store is embedded with
            OpenAI instead of HuggingFace. The Czech store always uses
            HuggingFace.
        openai_embedding_model: OpenAI embedding model name.
        openai_api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
            environment variable is used as a fallback.
        chunk: ``chunk_size`` passed to the text splitter.
        overlap: ``chunk_overlap`` passed to the text splitter.
        english_store: Path the English FAISS store is saved to.
        czech_store: Path the Czech FAISS store is saved to.
        data_english: Folder the English documents are loaded from.
        data_czech: Folder the Czech documents are loaded from.
    """

    def __init__(
        self,
        *,
        english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        czech_embedding_model: str = "Seznam/retromae-small-cs",
        use_openai_embeddings: bool = False,
        openai_embedding_model: str = "text-embedding-3-large",
        openai_api_key: str | None = None,
        chunk: int = 512,
        overlap: int = 256,
        english_store: str = "stores/english_512",
        czech_store: str = "stores/czech_512",
        data_english: str = "data/english",
        data_czech: str = "data/czech",
    ):
        self.english_embedding_model = english_embedding_model
        self.czech_embedding_model = czech_embedding_model
        self.use_openai_embeddings = use_openai_embeddings
        self.openai_embedding_model = openai_embedding_model
        self.openai_api_key = openai_api_key
        self.chunk = chunk
        self.overlap = overlap
        self.english_store = Path(english_store)
        self.czech_store = Path(czech_store)
        self.data_english = Path(data_english)
        self.data_czech = Path(data_czech)

    # ------------------------------------------------------------------ utils
    @staticmethod
    def _load(folder: Path):
        """Load every document under *folder* (recursively) with PyPDFLoader."""
        return DirectoryLoader(
            str(folder),
            recursive=True,
            loader_cls=PyPDFLoader,
            use_multithreading=True,
            show_progress=True,
        ).load()

    @staticmethod
    def _split(docs: List, chunk: int, overlap: int):
        """Split *docs* into chunks of size *chunk* overlapping by *overlap*."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk, chunk_overlap=overlap
        )
        return splitter.split_documents(docs)

    @staticmethod
    def _hf_embeddings(model_name: str):
        """Return CPU HuggingFace embeddings for *model_name*, unnormalized."""
        return HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )

    def _build_store(self, data_dir: Path, store_dir: Path, embed) -> None:
        """Load, split and embed *data_dir*, then save the index to *store_dir*.

        Raises:
            ValueError: If no text chunks were produced from *data_dir*.
        """
        texts = self._split(self._load(data_dir), self.chunk, self.overlap)
        if not texts:
            # Fail early with a readable message instead of letting the
            # vector store choke on an empty document list.
            raise ValueError(f"No documents to ingest found in {data_dir}")
        FAISS.from_documents(texts, embed).save_local(str(store_dir))

    # ------------------------------------------------------------------ ENG
    def ingest_english(self):
        """Build and save the English FAISS store.

        Uses OpenAI embeddings when ``use_openai_embeddings`` is set,
        otherwise the configured English HuggingFace model.

        Raises:
            ValueError: If OpenAI embeddings are requested but no API key was
                given and ``OPENAI_API_KEY`` is not set, or if the data folder
                yields no text chunks.
            ImportError: If OpenAI embeddings are requested but
                ``langchain_openai`` is not installed.
        """
        if self.use_openai_embeddings:
            # The explicit key wins; the environment variable is a fallback.
            api_key = self.openai_api_key or os.environ.get("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("OPENAI_API_KEY missing for OpenAI embeddings.")
            if OpenAIEmbeddings is None:
                raise ImportError(
                    "langchain_openai must be installed to use OpenAI embeddings."
                )
            embed = OpenAIEmbeddings(
                openai_api_key=api_key,
                model=self.openai_embedding_model,
            )
            mode = f"OpenAI {self.openai_embedding_model}"
        else:
            embed = self._hf_embeddings(self.english_embedding_model)
            mode = f"HuggingFace {self.english_embedding_model}"

        print(f"• English ingest with {mode}")
        self._build_store(self.data_english, self.english_store, embed)
        print("✓ English store saved to", self.english_store)

    # ------------------------------------------------------------------ CZ
    def ingest_czech(self):
        """Build and save the Czech FAISS store with the Czech HuggingFace model.

        Raises:
            ValueError: If the data folder yields no text chunks.
        """
        embed = self._hf_embeddings(self.czech_embedding_model)
        print(f"• Czech ingest with {self.czech_embedding_model}")
        self._build_store(self.data_czech, self.czech_store, embed)
        print("✓ Czech store saved to", self.czech_store)