# DiabetesPilot / app.py
import gradio as gr
# from huggingface_hub import InferenceClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from sentence_transformers import SentenceTransformer
from huggingface_hub import Repository, upload_file
# from langchain.vectorstores import FAISS
import faiss
import pandas as pd
from datasets import Dataset
import os
DATA_PATH_SOC='./data/SOCNIN'
DATA_PATH_DIET1='./data/Krause 1'
DATA_PATH_DIET2='./data/Krause 2'
DATA_PATH_DIET3='./data/Krause 3'
HF_TOKEN = os.getenv('HF_Token')
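# Hub token for push_to_hub, read from an environment secret named 'HF_Token'.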
#DB_FAISS_PATH="./vectorstores/db_faiss"
#folder_path="../database"
def create_vector_db():
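    """Chunk guideline PDFs, embed the chunks, and push them to the Hub.

    Earlier runs over the SOC and Krause 1/2 folders are kept below as
    commented-out history; the current run processes only the Krause 3
    folder.
    """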
# Create embeddings of SOC folder
# loader = DirectoryLoader(DATA_PATH_SOC, glob='*.pdf', loader_cls=PyPDFLoader, show_progress=True)
# documents =loader.load()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=70)
# texts = text_splitter.split_documents(documents)
# print(texts[3])
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# df = pd.DataFrame(texts)
# column_headers = list(df.columns.values)
# print(column_headers)
# pd.options.display.max_colwidth = 300
# print(df.iloc[[3]])
# df = df.drop(columns=[1, 2])
# print(df.iloc[[3]])
# df[0] = df[0].astype('string', errors='raise').copy()
# datatypes = df.dtypes
# print(datatypes)
# df[0] = df[0].str[18:]
# df[0] = df[0].str[:-2]
# df['page_content'] = df['page_content'].map(lambda x: x.rstrip("')"))
# df['page_content'] = df['page_content'].map(lambda x: x.lstrip('(page_content, '))
# print(df.iloc[[3]])
# df['embeddings'] = df[0].apply(lambda x: embedding_model.encode(x))
# add_embeddings as a new column
print("check1a")
# print(df.iloc[[3]])
# dataset = Dataset.from_pandas(df)
# print("check2b")
# print(dataset[3])
# dataset.push_to_hub("Namitg02/ADASOF24",token = HF_TOKEN)
print("check2c")
# Create embeddings of Krause 1 folder
# loader = DirectoryLoader(DATA_PATH_DIET1, glob='*.pdf', loader_cls=PyPDFLoader, show_progress=True)
# documents =loader.load()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=70)
# texts = text_splitter.split_documents(documents)
# print(texts[3])
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# df = pd.DataFrame(texts)
#
# column_headers = list(df.columns.values)
# print(column_headers)
# pd.options.display.max_colwidth = 300
# print(df.iloc[[3]])
# df = df.drop(columns=[1, 2])
# print(df.iloc[[3]])
# df[0] = df[0].astype('string', errors='raise').copy()
# datatypes = df.dtypes
# print(datatypes)
# df[0] = df[0].str[18:]
# df[0] = df[0].str[:-2]
# print(df.iloc[[3]])
# df['embeddings'] = df[0].apply(lambda x: embedding_model.encode(x))
# add_embeddings as a new column
print("check4a")
# print(df.iloc[[3]])
# datasetdiet1 = Dataset.from_pandas(df)
# print("check4b")
# print(dataset[3])
# datasetdiet1.push_to_hub("Namitg02/Krause1",token = HF_TOKEN)
# Create embeddings of Krause 2 folder
# loader = DirectoryLoader(DATA_PATH_DIET2, glob='*.pdf', loader_cls=PyPDFLoader, show_progress=True)
# documents =loader.load()
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=70)
# texts = text_splitter.split_documents(documents)
# print(texts[3])
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# df = pd.DataFrame(texts)
# column_headers = list(df.columns.values)
# print(column_headers)
# pd.options.display.max_colwidth = 300
# print(df.iloc[[3]])
# df = df.drop(columns=[1, 2])
# print(df.iloc[[3]])
# df[0] = df[0].astype('string', errors='raise').copy()
# datatypes = df.dtypes
# print(datatypes)
# df[0] = df[0].str[18:]
# df[0] = df[0].str[:-2]
# print(df.iloc[[3]])
#
# df['embeddings'] = df[0].apply(lambda x: embedding_model.encode(x))
# add_embeddings as a new column
# print("check4c")
# print(df.iloc[[3]])
# datasetdiet2 = Dataset.from_pandas(df)
print("check4d")
# print(dataset[3])
# datasetdiet2.push_to_hub("Namitg02/Krause2",token = HF_TOKEN)
# Create embeddings of Krause 3 folder
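    # Pipeline for the Krause 3 folder: load every PDF, split into
    # 350-character chunks with 70-character overlap, embed each chunk
    # with all-MiniLM-L6-v2, and push the result to the Hub.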
    loader = DirectoryLoader(DATA_PATH_DIET3, glob='*.pdf', loader_cls=PyPDFLoader, show_progress=True)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=70)
    texts = text_splitter.split_documents(documents)
    print(texts[3])
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    df = pd.DataFrame(texts)
    column_headers = list(df.columns.values)
    print(column_headers)
    pd.options.display.max_colwidth = 300
    print(df.iloc[[3]])
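    # Each LangChain Document iterates as (field, value) pairs, so
    # pd.DataFrame(texts) appears to yield three tuple-valued columns:
    # 0 = ('page_content', text), 1 = ('metadata', ...), 2 = ('type', ...).
    # The steps below keep column 0, stringify it, and slice off the
    # "('page_content', '" prefix (18 chars) and the trailing "')".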
    df = df.drop(columns=[1, 2])
    print(df.iloc[[3]])
    df[0] = df[0].astype('string', errors='raise').copy()
    datatypes = df.dtypes
    print(datatypes)
    df[0] = df[0].str[18:]
    df[0] = df[0].str[:-2]
    print(df.iloc[[3]])
    df['embeddings'] = df[0].apply(lambda x: embedding_model.encode(x))
    # add embeddings as a new column
    print("check4e")
    print(df.iloc[[3]])
    datasetdiet3 = Dataset.from_pandas(df)
    print("check4f")
    print(datasetdiet3[3])
    datasetdiet3.push_to_hub("Namitg02/Krause3", token=HF_TOKEN)
    print("Krause 3 dataset uploaded")
if __name__ == "__main__":
print("check31")
create_vector_db()