Spaces:
Runtime error
Runtime error
File size: 3,836 Bytes
cd7b78b 90a08b2 9f3c9bf cd7b78b 05055d0 cd7b78b 99afe26 90a08b2 05055d0 cd7b78b 90a08b2 99afe26 05055d0 cd7b78b 90a08b2 99afe26 cd7b78b 90a08b2 05055d0 90a08b2 1f4bbb8 90a08b2 cd7b78b 90a08b2 99afe26 90a08b2 cd7b78b 99afe26 cd7b78b 90a08b2 05055d0 90a08b2 cd7b78b 90a08b2 05055d0 90a08b2 cd7b78b 05055d0 cd7b78b 99afe26 cd7b78b 99afe26 05055d0 90a08b2 99afe26 cd7b78b 05055d0 cd7b78b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# ingest.py — works with LangChain v0.2+
from pathlib import Path
from typing import List
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings # optional
class Ingest:
def __init__(
self,
*,
english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
czech_embedding_model: str = "Seznam/retromae-small-cs",
use_openai_embeddings: bool = False,
openai_embedding_model: str = "text-embedding-3-large",
openai_api_key: str | None = None,
chunk: int = 512,
overlap: int = 256,
english_store: str = "stores/english_512",
czech_store: str = "stores/czech_512",
data_english: str = "data/english",
data_czech: str = "data/czech",
):
self.english_embedding_model = english_embedding_model
self.czech_embedding_model = czech_embedding_model
self.use_openai_embeddings = use_openai_embeddings
self.openai_embedding_model = openai_embedding_model
self.openai_api_key = openai_api_key
self.chunk = chunk
self.overlap = overlap
self.english_store = Path(english_store)
self.czech_store = Path(czech_store)
self.data_english = Path(data_english)
self.data_czech = Path(data_czech)
# ------------------------------------------------------------------ utils
@staticmethod
def _load(folder: Path):
return DirectoryLoader(
str(folder),
recursive=True,
loader_cls=PyPDFLoader,
use_multithreading=True,
show_progress=True,
).load()
@staticmethod
def _split(docs: List, chunk: int, overlap: int):
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
chunk_overlap=overlap)
return splitter.split_documents(docs)
# ------------------------------------------------------------------ ENG
def ingest_english(self):
if self.use_openai_embeddings:
if not self.openai_api_key:
raise ValueError("OPENAI_API_KEY missing for OpenAI embeddings.")
embed = OpenAIEmbeddings(
openai_api_key=self.openai_api_key,
model=self.openai_embedding_model,
)
mode = f"OpenAI {self.openai_embedding_model}"
else:
embed = HuggingFaceEmbeddings(
model_name=self.english_embedding_model,
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": False},
)
mode = f"HuggingFace {self.english_embedding_model}"
print(f"β’ English ingest with {mode}")
texts = self._split(self._load(self.data_english), self.chunk, self.overlap)
FAISS.from_documents(texts, embed).save_local(str(self.english_store))
print("β English store saved to", self.english_store)
# ------------------------------------------------------------------ CZ
def ingest_czech(self):
embed = HuggingFaceEmbeddings(
model_name=self.czech_embedding_model,
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": False},
)
print(f"β’ Czech ingest with {self.czech_embedding_model}")
texts = self._split(self._load(self.data_czech), self.chunk, self.overlap)
FAISS.from_documents(texts, embed).save_local(str(self.czech_store))
print("β Czech store saved to", self.czech_store)
|