# Import the necessary libraries
import subprocess
import sys

# Function to install a package using pip
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install required packages
try:
    install("gradio")
    install("openai==1.23.2")
    install("tiktoken==0.6.0")
    install("pypdf==4.0.1")
    install("langchain==0.1.1")
    install("langchain-community==0.0.13")
    install("chromadb==0.4.22")
    install("sentence-transformers==2.3.1")
except subprocess.CalledProcessError as e:
    print(f"An error occurred: {e}")
import gradio as gr
import os
import io  # needed for io.BytesIO when unzipping the download in memory below
import uuid
import json
import pandas as pd
from openai import OpenAI
from huggingface_hub import HfApi, CommitScheduler, hf_hub_download
import zipfile
# Define the repository and file path
repo_id = "kgauvin603/rag-10k"
#file_path = "dataset.zip"

# Download the file
#downloaded_file = hf_hub_download(repo_id, file_path)

# Print the path to the downloaded file
#print(f"Downloaded file is located at: {downloaded_file}")
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
#from google.colab import userdata, drive
from pathlib import Path
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
print(f"Pass 1") | |
# Define the embedding model and the vectorstore | |
embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large') | |
# If dataset directory exixts, remove it and all of the contents within | |
#if os.path.exists('dataset'): | |
# !rm -rf dataset | |
# If collection_db exists, remove it and all of the contents within | |
#if os.path.exists('collection_db'): | |
# !rm -rf dataset | |
#Mount the Google Drive | |
#drive.mount('/content/drive') | |
#Upload Dataset-10k.zip and unzip it dataset folder using -d option | |
#!unzip Dataset-10k.zip -d dataset | |
# Command to unzip the file (kept for reference)
#command = "unzip kgauvin603/10k-reports/Dataset-10k.zip -d dataset"

# Command to install the additional packages used below
command = "pip install transformers huggingface_hub requests"

# Execute the command
try:
    subprocess.run(command, check=True, shell=True)
except subprocess.CalledProcessError as e:
    print(f"An error occurred: {e}")
import requests

print("Pass 2")
#repo_id = "kgauvin603/10k-reports" | |
#file_path = "dataset" | |
# Get the URL for the file in the repository | |
#file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file_path}" | |
#print(file_url) | |
# Command to unzip the file | |
#command = "unzip kgauvin603/10k-reports/Dataset-10k.zip -d dataset" | |
# Execute the command | |
#try: | |
# subprocess.run(command, check=True, shell=True) | |
#except subprocess.CalledProcessError as e: | |
# print(f"An error occurred: {e}") | |
#https://huggingface.co/datasets/kgauvin603/10k-reports | |
# Define the repository and file path
repo_id = "kgauvin603/10k-reports"
file_path = "Dataset-10k.zip"

# Construct the URL for the file in the repository (resolve/main serves the raw file)
file_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{file_path}"
print(f"File URL: {file_url}")

# Download the zip file
response = requests.get(file_url)
response.raise_for_status()  # Ensure the request was successful
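# For very large archives, a streamed download would avoid buffering the whole
# response in memory at once (a sketch using the same URL):
#with requests.get(file_url, stream=True) as r:
#    r.raise_for_status()
#    with open(file_path, "wb") as f:
#        for chunk in r.iter_content(chunk_size=8192):
#            f.write(chunk)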
# Unzip the file in memory
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    # List the files in the zip archive
    zip_file_list = zip_ref.namelist()
    print(f"Files in the zip archive: {zip_file_list}")

    # Extract specific files or work with them directly in memory
    for file_name in zip_file_list:
        with zip_ref.open(file_name) as file:
            content = file.read()
            print(f"Content of {file_name}: {content[:100]}...")  # Print the first 100 bytes of each file
# If you need to save the extracted files to disk, you can do so as follows:

# Define the extraction path
extraction_path = "./dataset"
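# A minimal sketch of the remaining steps, with two hedges: it assumes the
# archive holds the 10-K PDFs at its top level, and the chunk sizes below are
# illustrative assumptions rather than tuned values.

# Re-open the in-memory archive and write its contents under extraction_path
os.makedirs(extraction_path, exist_ok=True)
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
    zip_ref.extractall(extraction_path)

# Load the PDFs, split them into overlapping chunks, and build the vectorstore
# announced alongside embedding_model above
pdf_loader = PyPDFDirectoryLoader(extraction_path)
documents = pdf_loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(documents)
vectorstore = Chroma.from_documents(chunks, embedding_model, persist_directory='collection_db')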