New-LS-chatbot-app / init_dataset.py
Rulga's picture
feat: initialize dataset structure with chat history and vector store
231b18c
import json
import os
import tempfile

from huggingface_hub import HfApi, create_repo
# Configuration
# The access token is taken from the HF_TOKEN environment variable so the
# secret does not have to be written into (and accidentally committed with)
# this file. When the variable is unset, the original placeholder is used;
# replace it with a real token or export HF_TOKEN before running the script.
HF_TOKEN = os.environ.get("HF_TOKEN", "your_token_here")
HF_USERNAME = "Rulga"
DATASET_NAME = "LS_chat"
# Repository id in "<username>/<dataset name>" form, as passed to the Hub API.
DATASET_REPO = f"{HF_USERNAME}/{DATASET_NAME}"
# README.md describing the dataset layout; written into the staging directory
# below and uploaded together with the directory skeleton.
readme_content = """
# LS Chat Dataset
This dataset contains chat history and vector store for the Status.Law Legal Assistant.
## Structure
- `chat_history/`: Contains daily chat history files
- `vector_store/`: Contains FAISS vector store files
## Usage
This dataset is automatically updated by the Status.Law Legal Assistant application.
"""

# Stage the skeleton in a fresh, uniquely named temporary directory. A fixed
# "temp_dataset" folder in the working directory could already exist: its
# stray contents would be uploaded and the folder deleted afterwards. The
# context manager removes the staging directory on every exit path, including
# failures while the files are being created.
with tempfile.TemporaryDirectory(prefix="temp_dataset_") as temp_dir:
    for subdir in ("chat_history", "vector_store"):
        subdir_path = os.path.join(temp_dir, subdir)
        os.makedirs(subdir_path, exist_ok=True)
        # Empty .gitkeep placeholder, presumably so the otherwise-empty
        # directory has a file to upload and therefore exists in the repo.
        with open(os.path.join(subdir_path, ".gitkeep"), "w", encoding="utf-8"):
            pass

    with open(os.path.join(temp_dir, "README.md"), "w", encoding="utf-8") as f:
        f.write(readme_content)

    # Create the repository if needed and upload the staged structure.
    try:
        api = HfApi()
        try:
            # Pass the token so a private, already existing repository is
            # visible to this check instead of looking like a missing one.
            api.repo_info(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
            print(f"Repository {DATASET_REPO} already exists")
        except Exception:
            # Any lookup failure falls through to creation, as before.
            # exist_ok=True keeps this from raising a conflict when the
            # lookup failed for a reason other than "repository not found".
            create_repo(
                DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                exist_ok=True,
            )
            print(f"Created new repository {DATASET_REPO}")

        api.upload_folder(
            folder_path=temp_dir,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            commit_message="Initialize dataset structure",
            token=HF_TOKEN,
        )
        print("Successfully initialized dataset structure!")
    except Exception as e:
        # Best-effort script: report the failure instead of raising.
        print(f"Error: {str(e)}")