"""Build an embedded text dataset from local PDFs and push it to the Hugging Face Hub."""

import gradio as gr
# from huggingface_hub import InferenceClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from sentence_transformers import SentenceTransformer
from huggingface_hub import Repository, upload_file
# from langchain.vectorstores import FAISS
import faiss
import pandas as pd
from datasets import Dataset
import os

# Directory scanned (non-recursively, via glob '*.pdf') for source documents.
DATA_PATH = './data'
# Hub access token, read from the environment; None when the variable is unset.
HF_TOKEN = os.getenv('HF_Token')
# DB_FAISS_PATH = "./vectorstores/db_faiss"
# folder_path = "../database"

# Row shown in the debug previews; clamped when fewer rows are available.
_PREVIEW_ROW = 3


def create_vector_db() -> None:
    """Chunk the PDFs under DATA_PATH, embed each chunk and push the result to the Hub.

    The pushed dataset has two columns: ``0`` (the chunk text) and
    ``embeddings`` (the "all-MiniLM-L6-v2" sentence embedding of that text).
    Progress markers and sample rows are printed along the way.

    Raises:
        ValueError: if no text chunk could be produced from the PDFs.
    """
    loader = DirectoryLoader(
        DATA_PATH, glob='*.pdf', loader_cls=PyPDFLoader, show_progress=True
    )
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Nothing to embed; fail loudly instead of pushing an empty dataset.
        raise ValueError(f"No text chunks could be extracted from PDFs in {DATA_PATH!r}")

    # The previews used a hard-coded index 3, which raised IndexError on
    # inputs that split into fewer than four chunks.
    sample = min(_PREVIEW_ROW, len(texts) - 1)
    print(texts[sample])

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Read the chunk text straight from each Document. Building the frame from
    # the Document objects and slicing their string form stored the repr of
    # the text (escaped newlines/quotes) rather than the text itself.
    # The column label stays 0 so the pushed dataset keeps its column name.
    chunk_texts = pd.Series([doc.page_content for doc in texts], dtype="string")
    df = pd.DataFrame({0: chunk_texts})
    print(list(df.columns.values))
    pd.options.display.max_colwidth = 300
    print(df.dtypes)
    print(df.iloc[[sample]])

    # One embedding vector per chunk, stored as a new column.
    df['embeddings'] = df[0].apply(embedding_model.encode)
    print("check1a")
    print(df.iloc[[sample]])

    dataset = Dataset.from_pandas(df)
    print("check2b")
    print(dataset[sample])
    dataset.push_to_hub("Namitg02/ADASOF24", token=HF_TOKEN)
    print("check2c")

    # Disabled: local FAISS index experiment (references names that are not
    # defined in this file, so it must stay commented out as written).
    # embedding_dim = embedding_model.get_sentence_embedding_dimension()  # Returns dimensions of embedding
    # print(embedding_dim)
    # index = faiss.IndexFlatL2(embedding_dim)
    # dataset.save_to_disk("ADASOF24.hf")
    # datawithIndex = data.add_faiss_index("embeddings", custom_index=index)
    # print("check3")
    # data.save_local(folder_path= DB_FAISS_PATH)


if __name__ == "__main__":
    print("check31")
    create_vector_db()