# Spaces:
# Running
# Running
"""Build a DeepLake vector store from a CSV of document chunks and zip it."""

import pandas as pd

from buster.documents_manager import DeepLakeDocumentsManager


def main():
    """Load the chunk CSV, drop incomplete rows, store them, and zip the store.

    The row count is printed before and after the drop so the amount of
    discarded data is visible in the console output.
    """
    vector_store_path = "wiki_tai_langchain"
    chunk_file = "./data/wiki_tai_langchain.csv"
    # Passed straight through to the documents manager.
    # NOTE(review): presumably replaces an existing store at
    # `vector_store_path` -- confirm against the buster documentation.
    overwrite = True

    df = pd.read_csv(chunk_file)
    print(f"before drop: {len(df)}")

    # With default arguments, dropna() removes every row that has a missing
    # value in ANY column, not only in the required columns listed below.
    df = df.dropna()
    print(f"after drop: {len(df)}")

    dm = DeepLakeDocumentsManager(
        vector_store_path,
        overwrite=overwrite,
        required_columns=["url", "source", "content", "title"],
    )
    dm.batch_add(df)

    # NOTE(review): the return value is treated as the path of the archive
    # that was written -- confirm against the buster documentation.
    zipped_file_path = dm.to_zip()
    print(f"Contents zipped to: {zipped_file_path}")


if __name__ == "__main__":
    main()