Spaces:
Sleeping
Sleeping
kgauvin603
committed on
Commit
•
839ea83
1
Parent(s):
fbbf62f
Rename app.py to app_full.py
Browse files - app.py → app_full.py +53 -3
app.py → app_full.py
RENAMED
@@ -28,6 +28,18 @@ import subprocess
|
|
28 |
from openai import OpenAI
|
29 |
from huggingface_hub import HfApi
|
30 |
from huggingface_hub import CommitScheduler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
from langchain_community.embeddings.sentence_transformer import (
|
32 |
SentenceTransformerEmbeddings
|
33 |
)
|
@@ -63,16 +75,54 @@ embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
|
|
63 |
import subprocess
|
64 |
|
65 |
# Command to unzip the file
|
66 |
-
command = "unzip kgauvin603/rag-10k-analysis/Dataset-10k.zip -d dataset"
|
67 |
-
|
68 |
# Execute the command
|
69 |
try:
|
70 |
subprocess.run(command, check=True, shell=True)
|
71 |
except subprocess.CalledProcessError as e:
|
72 |
print(f"An error occurred: {e}")
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
# Provide pdf_folder_location
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
# Load the directory to pdf_loader
|
78 |
pdf_loader = PyPDFDirectoryLoader(pdf_folder_location)
|
|
|
28 |
from openai import OpenAI
|
29 |
from huggingface_hub import HfApi
|
30 |
from huggingface_hub import CommitScheduler
|
31 |
+
from huggingface_hub import hf_hub_download
|
32 |
+
import zipfile
|
33 |
+
# Define your repository and file path
|
34 |
+
repo_id = "kgauvin603/rag-10k"
|
35 |
+
file_path = "dataset.zip"
|
36 |
+
|
37 |
+
# Download the file
|
38 |
+
downloaded_file = hf_hub_download(repo_id, file_path)
|
39 |
+
|
40 |
+
# Print the path to the downloaded file
|
41 |
+
print(f"Downloaded file is located at: {downloaded_file}")
|
42 |
+
|
43 |
from langchain_community.embeddings.sentence_transformer import (
|
44 |
SentenceTransformerEmbeddings
|
45 |
)
|
|
|
75 |
import subprocess
|
76 |
|
77 |
# Command to install the required packages (the earlier unzip command is kept below, commented out)
|
78 |
+
#command = "unzip kgauvin603/rag-10k-analysis/Dataset-10k.zip -d dataset"
|
79 |
+
command = "pip install transformers huggingface_hub requests"
|
80 |
# Execute the command
|
81 |
try:
|
82 |
subprocess.run(command, check=True, shell=True)
|
83 |
except subprocess.CalledProcessError as e:
|
84 |
print(f"An error occurred: {e}")
|
85 |
+
|
86 |
+
from huggingface_hub import hf_hub_download
|
87 |
+
import zipfile
|
88 |
+
import os
|
89 |
+
import requests
|
90 |
|
91 |
# Provide pdf_folder_location
|
92 |
+
|
93 |
+
repo_id = "kgauvin603/rag-10k"
|
94 |
+
file_path = "dataset.zip"
|
95 |
+
# Get the URL for the file in the repository
|
96 |
+
file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file_path}"
|
97 |
+
|
98 |
+
# Download the file into memory
|
99 |
+
response = requests.get(file_url)
|
100 |
+
response.raise_for_status() # Ensure the request was successful
|
101 |
+
# Open the zip file in memory
|
102 |
+
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
|
103 |
+
# List the files in the zip archive
|
104 |
+
zip_file_list = zip_ref.namelist()
|
105 |
+
print(f"Files in the zip archive: {zip_file_list}")
|
106 |
+
|
107 |
+
# Extract specific files or work with them directly in memory
|
108 |
+
# For example, reading a specific file
|
109 |
+
with zip_ref.open('dataset/some_file.txt') as file:
|
110 |
+
file_content = file.read()
|
111 |
+
print(file_content.decode('utf-8'))
|
112 |
+
|
113 |
+
# Define the extraction path
|
114 |
+
#extraction_path = "./extracted_files"
|
115 |
+
|
116 |
+
# Create the directory if it doesn't exist
|
117 |
+
#os.makedirs(extraction_path, exist_ok=True)
|
118 |
+
|
119 |
+
# Extract the contents of the zip file
|
120 |
+
#with zipfile.ZipFile(downloaded_file, 'r') as zip_ref:
|
121 |
+
# zip_ref.extractall(extraction_path)
|
122 |
+
|
123 |
+
# List the files in the extraction path
|
124 |
+
#extracted_files = os.listdir(extraction_path)
|
125 |
+
#print(f"Extracted files: {extracted_files}")
|
126 |
|
127 |
# Load the directory to pdf_loader
|
128 |
pdf_loader = PyPDFDirectoryLoader(pdf_folder_location)
|