Spaces:

kgauvin603
/

rag-10k-analysis

Sleeping

App Files Files Community

kgauvin603 committed on Jun 23

Commit

839ea83

•

1 Parent(s): fbbf62f

Rename app.py to app_full.py

Browse files

Files changed (1) hide show

app.py → app_full.py +53 -3

app.py → app_full.py RENAMED Viewed

@@ -28,6 +28,18 @@ import subprocess
 from openai import OpenAI
 from huggingface_hub import HfApi
 from huggingface_hub import CommitScheduler
 from langchain_community.embeddings.sentence_transformer import (
     SentenceTransformerEmbeddings
 )
@@ -63,16 +75,54 @@ embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
 import subprocess
 # Command to unzip the file
-command = "unzip kgauvin603/rag-10k-analysis/Dataset-10k.zip -d dataset"
 # Execute the command
 try:
     subprocess.run(command, check=True, shell=True)
 except subprocess.CalledProcessError as e:
     print(f"An error occurred: {e}")
 # Provide pdf_folder_location
-pdf_folder_location = "kgauvin603/rag-10k-analysis/dataset"
 # Load the directory to pdf_loader
 pdf_loader = PyPDFDirectoryLoader(pdf_folder_location)

 from openai import OpenAI
 from huggingface_hub import HfApi
 from huggingface_hub import CommitScheduler
+from huggingface_hub import hf_hub_download
+import zipfile
+# Define your repository and file path
+repo_id = "kgauvin603/rag-10k"
+file_path = "dataset.zip"
+# Download the file
+downloaded_file = hf_hub_download(repo_id, file_path)
+# Print the path to the downloaded file
+print(f"Downloaded file is located at: {downloaded_file}")
 from langchain_community.embeddings.sentence_transformer import (
     SentenceTransformerEmbeddings
 )
 import subprocess
 # Command to unzip the file
+#command = "unzip kgauvin603/rag-10k-analysis/Dataset-10k.zip -d dataset"
+command = "pip install transformers huggingface_hub requests"
 # Execute the command
 try:
     subprocess.run(command, check=True, shell=True)
 except subprocess.CalledProcessError as e:
     print(f"An error occurred: {e}")
+from huggingface_hub import hf_hub_download
+import zipfile
+import os
+import requests
 # Provide pdf_folder_location
+repo_id = "kgauvin603/rag-10k"
+file_path = "dataset.zip"
+# Get the URL for the file in the repository
+file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file_path}"
+# Download the file into memory
+response = requests.get(file_url)
+response.raise_for_status()  # Ensure the request was successful
+# Open the zip file in memory
+with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
+    # List the files in the zip archive
+    zip_file_list = zip_ref.namelist()
+    print(f"Files in the zip archive: {zip_file_list}")
+    # Extract specific files or work with them directly in memory
+    # For example, reading a specific file
+    with zip_ref.open('dataset/some_file.txt') as file:
+        file_content = file.read()
+        print(file_content.decode('utf-8'))
+# Define the extraction path
+#extraction_path = "./extracted_files"
+# Create the directory if it doesn't exist
+#os.makedirs(extraction_path, exist_ok=True)
+# Extract the contents of the zip file
+#with zipfile.ZipFile(downloaded_file, 'r') as zip_ref:
+#    zip_ref.extractall(extraction_path)
+# List the files in the extraction path
+#extracted_files = os.listdir(extraction_path)
+#print(f"Extracted files: {extracted_files}")
 # Load the directory to pdf_loader
 pdf_loader = PyPDFDirectoryLoader(pdf_folder_location)