# NOTE(review): scraped Hugging Face Spaces page header ("Spaces: / Runtime
# error") was left at the top of this file as bare text, which is a
# SyntaxError in Python; preserved here as a comment. The "Runtime error"
# status is explained by the NameError fixed in create_vector_db below.
import os

import faiss
import gradio as gr
import pandas as pd
from datasets import Dataset
from huggingface_hub import Repository, upload_file
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from sentence_transformers import SentenceTransformer

# from huggingface_hub import InferenceClient
# from langchain.vectorstores import FAISS
# Folders holding the PDF corpora that feed the embedding pipeline.
DATA_PATH_SOC = './data/SOCNIN'
DATA_PATH_DIET1 = './data/Krause 1'
DATA_PATH_DIET2 = './data/Krause 2'
DATA_PATH_DIET3 = './data/Krause 3'

# Hugging Face Hub token for dataset pushes; None when the env var is unset.
HF_TOKEN = os.getenv('HF_Token')

# DB_FAISS_PATH = "./vectorstores/db_faiss"
# folder_path = "../database"
def create_vector_db():
    """Embed the Krause 3 PDF corpus and push it to the Hugging Face Hub.

    Pipeline: load every PDF under ``DATA_PATH_DIET3``, split the text into
    350-character chunks with 70-character overlap, encode each chunk with
    the ``all-MiniLM-L6-v2`` sentence-transformer, and upload the resulting
    (text, embedding) table as the Hub dataset ``Namitg02/Krause3``.

    Side effects: reads PDFs from disk, prints debug checkpoints, and needs
    network access plus a valid ``HF_TOKEN`` for the Hub push.

    NOTE(review): the commented-out SOC / Krause 1 / Krause 2 pipelines that
    previously lived in this function were copies of this one (already run
    and pushed as Namitg02/ADASOF24, Krause1 and Krause2); they have been
    removed. If they need re-running, factor this body into a helper taking
    (data_path, repo_id) instead of duplicating it.
    """
    # Debug checkpoints retained from the earlier (now removed) sections so
    # the Space's log output stays recognizable.
    print("check1a")
    print("check2c")
    print("check4a")
    print("check4d")

    # Load and chunk the Krause 3 PDFs.
    loader = DirectoryLoader(DATA_PATH_DIET3, glob='*.pdf',
                             loader_cls=PyPDFLoader, show_progress=True)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=70)
    texts = text_splitter.split_documents(documents)
    print(texts[3])

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Rows come from langchain Document objects: column 0 carries the page
    # content, columns 1-2 carry metadata we do not need.
    df = pd.DataFrame(texts)
    column_headers = list(df.columns.values)
    print(column_headers)
    pd.options.display.max_colwidth = 300
    print(df.iloc[[3]])
    df = df.drop(columns=[1, 2])
    print(df.iloc[[3]])
    df[0] = df[0].astype('string', errors='raise').copy()
    datatypes = df.dtypes
    print(datatypes)
    # Strip the stringified-tuple wrapping around the page content.
    # NOTE(review): the hard-coded offsets 18 / -2 assume a fixed
    # "('page_content', '...')" repr — confirm against the printed sample.
    df[0] = df[0].str[18:]
    df[0] = df[0].str[:-2]
    print(df.iloc[[3]])

    # Attach one embedding vector per chunk as a new column.
    df['embeddings'] = df[0].apply(lambda x: embedding_model.encode(x))
    print("check4e")
    print(df.iloc[[3]])

    datasetdiet3 = Dataset.from_pandas(df)
    print("check4f")
    # BUG FIX: the original printed `dataset[3]`, but `dataset` was only
    # assigned in commented-out code — a guaranteed NameError (the Space's
    # "Runtime error"). Print the dataset that was actually built.
    print(datasetdiet3[3])
    datasetdiet3.push_to_hub("Namitg02/Krause3", token=HF_TOKEN)
    print("combined pdf dataset uploaded")
if __name__ == "__main__": | |
print("check31") | |
create_vector_db() |