kgauvin603 committed on
Commit
839ea83
1 Parent(s): fbbf62f

Rename app.py to app_full.py

Browse files
Files changed (1) hide show
  1. app.py → app_full.py +53 -3
app.py → app_full.py RENAMED
@@ -28,6 +28,18 @@ import subprocess
28
  from openai import OpenAI
29
  from huggingface_hub import HfApi
30
  from huggingface_hub import CommitScheduler
 
 
 
 
 
 
 
 
 
 
 
 
31
  from langchain_community.embeddings.sentence_transformer import (
32
  SentenceTransformerEmbeddings
33
  )
@@ -63,16 +75,54 @@ embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
63
  import subprocess
64
 
65
  # Command to unzip the file
66
- command = "unzip kgauvin603/rag-10k-analysis/Dataset-10k.zip -d dataset"
67
-
68
  # Execute the command
69
  try:
70
  subprocess.run(command, check=True, shell=True)
71
  except subprocess.CalledProcessError as e:
72
  print(f"An error occurred: {e}")
 
 
 
 
 
73
 
74
  # Provide pdf_folder_location
75
- pdf_folder_location = "kgauvin603/rag-10k-analysis/dataset"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  # Load the directory to pdf_loader
78
  pdf_loader = PyPDFDirectoryLoader(pdf_folder_location)
 
28
  from openai import OpenAI
29
  from huggingface_hub import HfApi
30
  from huggingface_hub import CommitScheduler
31
+ from huggingface_hub import hf_hub_download
32
+ import zipfile
33
+ # Define your repository and file path
34
+ repo_id = "kgauvin603/rag-10k"
35
+ file_path = "dataset.zip"
36
+
37
+ # Download the file
38
+ downloaded_file = hf_hub_download(repo_id, file_path)
39
+
40
+ # Print the path to the downloaded file
41
+ print(f"Downloaded file is located at: {downloaded_file}")
42
+
43
  from langchain_community.embeddings.sentence_transformer import (
44
  SentenceTransformerEmbeddings
45
  )
 
75
  import subprocess
76
 
77
  # Command to install the required packages (the earlier unzip command below is disabled)
78
+ #command = "unzip kgauvin603/rag-10k-analysis/Dataset-10k.zip -d dataset"
79
+ command = "pip install transformers huggingface_hub requests"
80
  # Execute the command
81
  try:
82
  subprocess.run(command, check=True, shell=True)
83
  except subprocess.CalledProcessError as e:
84
  print(f"An error occurred: {e}")
85
+
86
+ from huggingface_hub import hf_hub_download
87
+ import zipfile
88
+ import os
89
+ import requests
90
 
91
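  # Provide pdf_folder_location (NOTE: the assignment was removed in this commit, but PyPDFDirectoryLoader below still reads it; confirm it is defined)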
92
+
93
+ repo_id = "kgauvin603/rag-10k"
94
+ file_path = "dataset.zip"
95
+ # Get the URL for the file in the repository
96
+ file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file_path}"
97
+
98
+ # Download the file into memory
99
+ response = requests.get(file_url)
100
+ response.raise_for_status() # Ensure the request was successful
101
+ # Open the zip file in memory (NOTE: needs `import io`; no such import is visible in this diff, so confirm it exists)
102
+ with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
103
+ # List the files in the zip archive
104
+ zip_file_list = zip_ref.namelist()
105
+ print(f"Files in the zip archive: {zip_file_list}")
106
+
107
+ # Extract specific files or work with them directly in memory
108
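+ # For example, reading a specific file (NOTE: the path below looks like a placeholder; zip_ref.open raises KeyError if the name is not in zip_file_list)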
109
+ with zip_ref.open('dataset/some_file.txt') as file:
110
+ file_content = file.read()
111
+ print(file_content.decode('utf-8'))
112
+
113
+ # Define the extraction path
114
+ #extraction_path = "./extracted_files"
115
+
116
+ # Create the directory if it doesn't exist
117
+ #os.makedirs(extraction_path, exist_ok=True)
118
+
119
+ # Extract the contents of the zip file
120
+ #with zipfile.ZipFile(downloaded_file, 'r') as zip_ref:
121
+ # zip_ref.extractall(extraction_path)
122
+
123
+ # List the files in the extraction path
124
+ #extracted_files = os.listdir(extraction_path)
125
+ #print(f"Extracted files: {extracted_files}")
126
 
127
  # Load the directory to pdf_loader
128
  pdf_loader = PyPDFDirectoryLoader(pdf_folder_location)