Rulga committed
Commit d4835b5 · 1 Parent(s): 5c15f1a

Refactor dataset.py: Update import path for HuggingFaceEmbeddings, streamline DatasetManager initialization, and enhance download_vector_store method with improved error handling and logging.

Files changed (1):
  1. src/knowledge_base/dataset.py +119 -143
src/knowledge_base/dataset.py CHANGED
@@ -10,7 +10,7 @@ from datetime import datetime
 import logging
 from huggingface_hub import HfApi, HfFolder
 from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from config.settings import (
     VECTOR_STORE_PATH,
     HF_TOKEN,
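
Note on the import change above: this commit moves HuggingFaceEmbeddings from langchain_huggingface to langchain_community.embeddings. As a hedged aside (not part of commit d4835b5), code that must run across langchain versions sometimes guards the import, since the class has lived under both paths shown in this diff:

    # Sketch only, not part of this commit: fall back between the two import
    # paths that appear in this diff, depending on installed packages.
    try:
        from langchain_community.embeddings import HuggingFaceEmbeddings
    except ImportError:
        from langchain_huggingface import HuggingFaceEmbeddings
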
@@ -26,155 +26,59 @@ from config.settings import (
 logger = logging.getLogger(__name__)
 
 class DatasetManager:
-    def __init__(self, dataset_name: Optional[str] = None, token: Optional[str] = None):
-        self.dataset_name = dataset_name or DATASET_ID
-        self.token = token if token else HF_TOKEN
-        self.api = HfApi(token=self.token)
-
-        # Use paths from settings
-        self.vector_store_path = DATASET_VECTOR_STORE_PATH
-        self.chat_history_path = DATASET_CHAT_HISTORY_PATH
-        self.fine_tuned_path = DATASET_FINE_TUNED_PATH
-        self.annotations_path = DATASET_ANNOTATIONS_PATH
-
-    # Add this method to the DatasetManager class in src/knowledge_base/dataset.py
-
-    def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
-        """Download vector store from dataset"""
-        try:
-            with tempfile.TemporaryDirectory() as temp_dir:
-                logger.debug(f"Downloading to temporary directory: {temp_dir}")
-
-                try:
-                    # Download vector store files
-                    index_path = self.api.hf_hub_download(
-                        repo_id=self.dataset_name,
-                        filename="vector_store/index.faiss",
-                        repo_type="dataset",
-                        local_dir=temp_dir
-                    )
-                    logger.debug(f"Downloaded index.faiss to: {index_path}")
-
-                    config_path = self.api.hf_hub_download(
-                        repo_id=self.dataset_name,
-                        filename="vector_store/index.pkl",
-                        repo_type="dataset",
-                        local_dir=temp_dir
-                    )
-                    logger.debug(f"Downloaded index.pkl to: {config_path}")
-
-                    # Initialize embeddings
-                    embeddings = HuggingFaceEmbeddings(
-                        model_name=EMBEDDING_MODEL,
-                        model_kwargs={'device': 'cpu'}
-                    )
-
-                    # Load vector store
-                    vector_store = FAISS.load_local(
-                        folder_path=os.path.join(temp_dir, "vector_store"),
-                        embeddings=embeddings
-                    )
-
-                    return True, vector_store
-
-                except Exception as e:
-                    logger.error(f"Error downloading vector store: {str(e)}")
-                    return False, f"Error downloading vector store: {str(e)}"
-
-        except Exception as e:
-            logger.error(f"Error in download_vector_store: {str(e)}")
-            return False, str(e)
-
-    def get_last_update_date(self):
-        """
-        Get the date of the last knowledge base update.
-
-        Returns:
-            str: Date of the last update in ISO format, or None if the information is unavailable
-        """
-        try:
-            # Try to get the metadata from the dataset
-            api = HfApi(token=self.hf_token)
-
-            # First, check whether a dedicated metadata file exists
-            files = api.list_repo_files(
-                repo_id=self.dataset_id,
-                repo_type="dataset"
-            )
-
-            metadata_file = "vector_store/metadata.json"
-
-            if metadata_file in files:
-                # Download the metadata file
-                temp_dir = tempfile.mkdtemp()
-                metadata_path = os.path.join(temp_dir, "metadata.json")
-
-                api.hf_hub_download(
-                    repo_id=self.dataset_id,
-                    repo_type="dataset",
-                    filename=metadata_file,
-                    local_dir=temp_dir,
-                    local_dir_use_symlinks=False
-                )
-
-                # Open the metadata file and read the date
-                with open(metadata_path, 'r') as f:
-                    metadata = json.load(f)
-                    return metadata.get("last_updated", None)
-
-            # If the dedicated file is not found, fall back to the date of the
-            # last commit for the vector_store directory
-            last_commit = api.get_repo_info(
-                repo_id=self.dataset_id,
-                repo_type="dataset"
-            )
-
-            # Read the date of the last commit
-            if hasattr(last_commit, "lastModified"):
-                return last_commit.lastModified
-
-            return None
-        except Exception as e:
-            logger.error(f"Error getting last update date: {str(e)}")
-            return None
-
-    def init_dataset_structure(self) -> Tuple[bool, str]:
-        """
-        Initialize dataset structure with required directories
-
-        Returns:
-            (success, message)
-        """
-        try:
-            # Check if repository exists
-            try:
-                self.api.repo_info(repo_id=self.dataset_name, repo_type="dataset")
-            except Exception:
-                # Create repository if it doesn't exist
-                self.api.create_repo(repo_id=self.dataset_name, repo_type="dataset", private=True)
-
-            # Create empty .gitkeep files to maintain structure
-            directories = ["vector_store", "chat_history", "documents"]
-
-            for directory in directories:
-                with tempfile.NamedTemporaryFile(delete=False) as temp:
-                    temp_path = temp.name
-
-                try:
-                    self.api.upload_file(
-                        path_or_fileobj=temp_path,
-                        path_in_repo=f"{directory}/.gitkeep",
-                        repo_id=self.dataset_name,
-                        repo_type="dataset"
-                    )
-                finally:
-                    if os.path.exists(temp_path):
-                        os.remove(temp_path)
-
-            return True, "Dataset structure initialized successfully"
-
-        except Exception as e:
-            return False, f"Error initializing dataset structure: {str(e)}"
+    def __init__(self, token: str = None, dataset_id: str = None):
+        """Initialize dataset manager"""
+        self.hf_token = token or HF_TOKEN
+        self.dataset_id = dataset_id or DATASET_ID
+        self.dataset_name = self.dataset_id
+        self.api = HfApi(token=self.hf_token)
+
+    def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
+        """Download vector store from dataset"""
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                logger.debug(f"Downloading to temporary directory: {temp_dir}")
+
+                try:
+                    # Download vector store files
+                    index_path = self.api.hf_hub_download(
+                        repo_id=self.dataset_name,
+                        filename="vector_store/index.faiss",
+                        repo_type="dataset",
+                        local_dir=temp_dir
+                    )
+                    logger.debug(f"Downloaded index.faiss to: {index_path}")
+
+                    config_path = self.api.hf_hub_download(
+                        repo_id=self.dataset_name,
+                        filename="vector_store/index.pkl",
+                        repo_type="dataset",
+                        local_dir=temp_dir
+                    )
+                    logger.debug(f"Downloaded index.pkl to: {config_path}")
+
+                    # Initialize embeddings
+                    embeddings = HuggingFaceEmbeddings(
+                        model_name=EMBEDDING_MODEL,
+                        model_kwargs={'device': 'cpu'}
+                    )
+
+                    # Load vector store
+                    vector_store = FAISS.load_local(
+                        folder_path=os.path.dirname(index_path),
+                        embeddings=embeddings,
+                        allow_dangerous_deserialization=True
+                    )
+
+                    return True, vector_store
+
+                except Exception as e:
+                    logger.error(f"Error downloading vector store: {str(e)}")
+                    return False, f"Error downloading vector store: {str(e)}"
+
+        except Exception as e:
+            logger.error(f"Error in download_vector_store: {str(e)}")
+            return False, str(e)
 
     def upload_vector_store(self, vector_store: FAISS) -> Tuple[bool, str]:
         """
 
@@ -285,6 +189,78 @@ def get_last_update_date(self):
             logger.error(f"Error uploading vector store: {str(e)}")
             return False, f"Error uploading vector store: {str(e)}"
 
+    def get_last_update_date(self) -> Optional[str]:
+        """
+        Get the date of last knowledge base update
+
+        Returns:
+            str: Last update date in ISO format or None if not found
+        """
+        try:
+            # Try to get metadata from dataset
+            files = self.api.list_repo_files(
+                repo_id=self.dataset_id,
+                repo_type="dataset"
+            )
+
+            if "vector_store/metadata.json" in files:
+                try:
+                    metadata_file = self.api.hf_hub_download(
+                        repo_id=self.dataset_id,
+                        filename="vector_store/metadata.json",
+                        repo_type="dataset"
+                    )
+
+                    with open(metadata_file, 'r') as f:
+                        metadata = json.load(f)
+                        return metadata.get("last_update")
+                except Exception:
+                    return None
+
+            return None
+
+        except Exception as e:
+            logger.error(f"Error getting last update date: {str(e)}")
+            return None
+
+    def init_dataset_structure(self) -> Tuple[bool, str]:
+        """
+        Initialize dataset structure with required directories
+
+        Returns:
+            (success, message)
+        """
+        try:
+            # Check if repository exists
+            try:
+                self.api.repo_info(repo_id=self.dataset_name, repo_type="dataset")
+            except Exception:
+                # Create repository if it doesn't exist
+                self.api.create_repo(repo_id=self.dataset_name, repo_type="dataset", private=True)
+
+            # Create empty .gitkeep files to maintain structure
+            directories = ["vector_store", "chat_history", "documents"]
+
+            for directory in directories:
+                with tempfile.NamedTemporaryFile(delete=False) as temp:
+                    temp_path = temp.name
+
+                try:
+                    self.api.upload_file(
+                        path_or_fileobj=temp_path,
+                        path_in_repo=f"{directory}/.gitkeep",
+                        repo_id=self.dataset_name,
+                        repo_type="dataset"
+                    )
+                finally:
+                    if os.path.exists(temp_path):
+                        os.remove(temp_path)
+
+            return True, "Dataset structure initialized successfully"
+
+        except Exception as e:
+            return False, f"Error initializing dataset structure: {str(e)}"
+
     def download_vector_store(self) -> Tuple[bool, Union[FAISS, str]]:
         """Download vector store from dataset"""
         try:
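
Taken together, the refactored class can be exercised end to end. A minimal usage sketch (not part of the commit; it assumes config.settings supplies HF_TOKEN, DATASET_ID, and EMBEDDING_MODEL as imported in the diff, that the module is importable as src.knowledge_base.dataset in your project layout, and that the dataset repo already holds vector_store/index.faiss and vector_store/index.pkl):

    # Hypothetical driver code, for illustration only.
    from src.knowledge_base.dataset import DatasetManager

    manager = DatasetManager()  # falls back to HF_TOKEN / DATASET_ID from settings

    ok, msg = manager.init_dataset_structure()  # creates the repo and .gitkeep files if missing
    print(msg)

    success, result = manager.download_vector_store()
    if success:
        # result is a FAISS vector store; query it via the standard LangChain API
        docs = result.similarity_search("example query", k=3)
        print(f"Last KB update: {manager.get_last_update_date()}")
    else:
        # result is the error message string returned by the method
        print(f"Download failed: {result}")

Note the tuple-return convention: both download_vector_store and init_dataset_structure report failure as (False, error_string) rather than raising, so callers branch on the first element before using the second.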