# ingest.py  – works with LangChain v0.2+
from pathlib import Path
from typing import List

from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings      # optional

class Ingest:
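    """Build per-language FAISS vector stores from folders of PDF documents."""
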
    def __init__(
        self,
        *,
        english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        czech_embedding_model:   str = "Seznam/retromae-small-cs",
        use_openai_embeddings:   bool = False,
        openai_embedding_model:  str = "text-embedding-3-large",
        openai_api_key: str | None = None,
        chunk: int = 512,
        overlap: int = 256,
        english_store: str = "stores/english_512",
        czech_store:   str = "stores/czech_512",
        data_english:  str = "data/english",
        data_czech:    str = "data/czech",
    ):
        self.english_embedding_model = english_embedding_model
        self.czech_embedding_model   = czech_embedding_model
        self.use_openai_embeddings   = use_openai_embeddings
        self.openai_embedding_model  = openai_embedding_model
        self.openai_api_key          = openai_api_key
        self.chunk   = chunk
        self.overlap = overlap
        self.english_store = Path(english_store)
        self.czech_store   = Path(czech_store)
        self.data_english  = Path(data_english)
        self.data_czech    = Path(data_czech)

    # ------------------------------------------------------------------ utils
    @staticmethod
    def _load(folder: Path):
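        """Recursively load every document under *folder* with PyPDFLoader."""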
        return DirectoryLoader(
            str(folder),
            glob="**/*.pdf",  # only hand PDFs to PyPDFLoader
            recursive=True,
            loader_cls=PyPDFLoader,
            use_multithreading=True,
            show_progress=True,
        ).load()

    @staticmethod
    def _split(docs: List, chunk: int, overlap: int):
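        """Split documents into character chunks of size *chunk* with *overlap* overlap."""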
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
                                                  chunk_overlap=overlap)
        return splitter.split_documents(docs)

    # ------------------------------------------------------------------ ENG
    def ingest_english(self):
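        """Embed the English corpus and persist it as a FAISS store."""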
        if self.use_openai_embeddings:
            if not self.openai_api_key:
                raise ValueError("OPENAI_API_KEY missing for OpenAI embeddings.")
            embed = OpenAIEmbeddings(
                openai_api_key=self.openai_api_key,
                model=self.openai_embedding_model,
            )
            mode = f"OpenAI {self.openai_embedding_model}"
        else:
            embed = HuggingFaceEmbeddings(
                model_name=self.english_embedding_model,
                model_kwargs={"device": "cpu"},
                encode_kwargs={"normalize_embeddings": False},
            )
            mode = f"HuggingFace {self.english_embedding_model}"
        print(f"β€’ English ingest with {mode}")
        texts = self._split(self._load(self.data_english), self.chunk, self.overlap)
        FAISS.from_documents(texts, embed).save_local(str(self.english_store))
        print("βœ“ English store saved to", self.english_store)

    # ------------------------------------------------------------------ CZ
    def ingest_czech(self):
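        """Embed the Czech corpus and persist it as a FAISS store."""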
        embed = HuggingFaceEmbeddings(
            model_name=self.czech_embedding_model,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )
        print(f"β€’ Czech ingest with {self.czech_embedding_model}")
        texts = self._split(self._load(self.data_czech), self.chunk, self.overlap)
        FAISS.from_documents(texts, embed).save_local(str(self.czech_store))
        print("βœ“ Czech store saved to", self.czech_store)