Commit ba88389 · Parent(s): 296dcac
Fixed urls import and reduced HTML boilerplate
rag.py
CHANGED
@@ -10,19 +10,266 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import WebBaseLoader
+from langchain.schema import Document
+import requests
+import json
+import base64
+from bs4 import BeautifulSoup
+import re
+
+def github_to_raw(url):
+    """Convert GitHub URL to raw content URL"""
+    return url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")
+
+def load_github_notebook(url):
+    """Load Jupyter notebook from GitHub URL using GitHub API"""
+    try:
+        # Convert GitHub blob URL to API URL
+        if "github.com" in url and "/blob/" in url:
+            # Extract owner, repo, branch and path from URL
+            parts = url.replace("https://github.com/", "").split("/")
+            owner = parts[0]
+            repo = parts[1]
+            branch = parts[3]  # usually 'main' or 'master'
+            path = "/".join(parts[4:])
+
+            api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+        else:
+            raise ValueError("URL must be a GitHub blob URL")
+
+        # Fetch notebook content
+        response = requests.get(api_url)
+        response.raise_for_status()
+
+        content_data = response.json()
+        if content_data.get('encoding') == 'base64':
+            notebook_content = base64.b64decode(content_data['content']).decode('utf-8')
+        else:
+            notebook_content = content_data['content']
+
+        # Parse notebook JSON
+        notebook = json.loads(notebook_content)
+
+        docs = []
+        cell_count = 0
+
+        # Process each cell
+        for cell in notebook.get('cells', []):
+            cell_count += 1
+            cell_type = cell.get('cell_type', 'unknown')
+            source = cell.get('source', [])
+
+            # Join source lines
+            if isinstance(source, list):
+                content = ''.join(source)
+            else:
+                content = str(source)
+
+            if content.strip():  # Only add non-empty cells
+                metadata = {
+                    'source': url,
+                    'cell_type': cell_type,
+                    'cell_number': cell_count,
+                    'name': f"{url} - Cell {cell_count} ({cell_type})"
+                }
+
+                # Add cell type prefix for better context
+                formatted_content = f"[{cell_type.upper()} CELL {cell_count}]\n{content}"
+
+                docs.append(Document(page_content=formatted_content, metadata=metadata))
+
+        return docs
+
+    except Exception as e:
+        print(f"Error loading notebook from {url}: {str(e)}")
+        return []
+
+def clean_text(text):
+    """Clean text content from a webpage"""
+    # Remove excessive newlines
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    # Remove excessive whitespace
+    text = re.sub(r'\s{2,}', ' ', text)
+    return text.strip()
+
+def clean_github_content(html_content):
+    """Extract meaningful content from GitHub pages"""
+    # Ensure we're working with a BeautifulSoup object
+    if isinstance(html_content, str):
+        soup = BeautifulSoup(html_content, 'html.parser')
+    else:
+        soup = html_content
+
+    # Remove navigation, footer, and other boilerplate
+    for element in soup.find_all(['nav', 'footer', 'header']):
+        element.decompose()
+
+    # For README and code files
+    readme_content = soup.find('article', class_='markdown-body')
+    if readme_content:
+        return clean_text(readme_content.get_text())
+
+    # For code files
+    code_content = soup.find('table', class_='highlight')
+    if code_content:
+        return clean_text(code_content.get_text())
+
+    # For directory listings
+    file_list = soup.find('div', role='grid')
+    if file_list:
+        return clean_text(file_list.get_text())
+
+    # Fallback to main content
+    main_content = soup.find('main')
+    if main_content:
+        return clean_text(main_content.get_text())
+
+    # If no specific content found, get text from body
+    body = soup.find('body')
+    if body:
+        return clean_text(body.get_text())
+
+    # Final fallback
+    return clean_text(soup.get_text())
+
+class GitHubLoader(WebBaseLoader):
+    """Custom loader for GitHub pages with better content cleaning"""
+
+    def clean_text(self, text):
+        """Clean text content"""
+        # Remove excessive newlines and spaces
+        text = re.sub(r'\n{2,}', '\n', text)
+        text = re.sub(r'\s{2,}', ' ', text)
+        # Remove common GitHub boilerplate
+        text = re.sub(r'Skip to content|Sign in|Search or jump to|Footer navigation|Terms|Privacy|Security|Status|Docs', '', text)
+        return text.strip()
+
+    def _scrape(self, url: str, *args, **kwargs) -> str:
+        """Scrape data from URL and clean it.
+
+        Args:
+            url: The URL to scrape
+            *args: Additional positional arguments
+            **kwargs: Additional keyword arguments including bs_kwargs
+
+        Returns:
+            str: The cleaned content
+        """
+        response = requests.get(url)
+        response.raise_for_status()
+
+        # For directory listings (tree URLs), use the API
+        if '/tree/' in url:
+            # Parse URL components
+            parts = url.replace("https://github.com/", "").split("/")
+            owner = parts[0]
+            repo = parts[1]
+            branch = parts[3]  # usually 'main' or 'master'
+            path = "/".join(parts[4:]) if len(parts) > 4 else ""
+
+            # Construct API URL
+            api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+            api_response = requests.get(api_url)
+            api_response.raise_for_status()
+
+            # Parse directory listing
+            contents = api_response.json()
+            if isinstance(contents, list):
+                # Format directory contents
+                files = [f"{item['name']} ({item['type']})" for item in contents]
+                return "Directory contents:\n" + "\n".join(files)
+            else:
+                return f"Error: Unexpected API response for {url}"
+
+        # For regular files, parse HTML
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # For README and markdown files
+        readme_content = soup.find('article', class_='markdown-body')
+        if readme_content:
+            return self.clean_text(readme_content.get_text())
+
+        # For code files
+        code_content = soup.find('table', class_='highlight')
+        if code_content:
+            return self.clean_text(code_content.get_text())
+
+        # For other content, get main content
+        main_content = soup.find('main')
+        if main_content:
+            return self.clean_text(main_content.get_text())
+
+        # Final fallback
+        return self.clean_text(soup.get_text())
 
 # Load documentation from urls
 def load_docs():
-
     # Get urls
     urlsfile = open("urls.txt")
     urls = urlsfile.readlines()
     urls = [url.replace("\n","") for url in urls]
     urlsfile.close()
 
-    # Load
-
-
+    # Load documents from URLs
+    docs = []
+
+    for url in urls:
+        url = url.strip()
+        if not url:
+            continue
+
+        # Check if URL is a Jupyter notebook
+        if url.endswith('.ipynb') and 'github.com' in url and '/blob/' in url:
+            print(f"Loading notebook: {url}")
+            notebook_docs = load_github_notebook(url)
+            docs.extend(notebook_docs)
+        # Handle Python and Markdown files using raw content
+        elif url.endswith(('.py', '.md')) and 'github.com' in url and '/blob/' in url:
+            print(f"Loading raw content: {url}")
+            try:
+                raw_url = github_to_raw(url)
+                loader = WebBaseLoader([raw_url])
+                web_docs = loader.load()
+                # Preserve original URL in metadata
+                for doc in web_docs:
+                    doc.metadata['source'] = url
+                docs.extend(web_docs)
+            except Exception as e:
+                print(f"Error loading {url}: {str(e)}")
+        # Handle directory listings
+        elif '/tree/' in url and 'github.com' in url:
+            print(f"Loading directory: {url}")
+            try:
+                # Parse URL components
+                parts = url.replace("https://github.com/", "").split("/")
+                owner = parts[0]
+                repo = parts[1]
+                branch = parts[3]  # usually 'main' or 'master'
+                path = "/".join(parts[4:]) if len(parts) > 4 else ""
+
+                # Construct API URL
+                api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+                response = requests.get(api_url)
+                response.raise_for_status()
+
+                # Parse directory listing
+                contents = response.json()
+                if isinstance(contents, list):
+                    # Format directory contents
+                    content = "Directory contents:\n" + "\n".join([f"{item['name']} ({item['type']})" for item in contents])
+                    docs.append(Document(page_content=content, metadata={'source': url}))
+                else:
+                    print(f"Error: Unexpected API response for {url}")
+            except Exception as e:
+                print(f"Error loading directory {url}: {str(e)}")
+        else:
+            print(f"Loading web page: {url}")
+            try:
+                loader = GitHubLoader([url])  # Use custom loader
+                web_docs = loader.load()
+                docs.extend(web_docs)
+            except Exception as e:
+                print(f"Error loading {url}: {str(e)}")
 
     # Add source URLs as document names for reference
     for i, doc in enumerate(docs):
@@ -76,9 +323,9 @@ def RAG(llm, docs, embeddings):
     combined_template = "You are an assistant for question-answering tasks. "\
         + "Use the following pieces of retrieved context to answer the question. "\
        + "If you don't know the answer, just say that you don't know. "\
-        + "
-        + "Write the names of the relevant functions from the retrived code. "\
-        + "Include the
+        + "Try to keep the answer concise if possible. "\
+        + "Write the names of the relevant functions from the retrived code and include code snippets to aid the user's understanding. "\
+        + "Include the references used in square brackets at the end of your answer."\
        + template_parts[1]
     prompt.messages[0].prompt.template = combined_template
 
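For reference, a minimal sketch (not part of the commit) of what the URL handling added above computes; the example URL is one of the entries in urls.txt, and the two conversions simply restate what github_to_raw() and the contents-API parsing in load_github_notebook() / GitHubLoader._scrape() do:

    # Sketch: reproduce the URL conversions used by the new loaders in rag.py
    url = "https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py"

    # github_to_raw(): blob URL -> raw.githubusercontent.com URL
    raw_url = url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")
    print(raw_url)
    # https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/utils.py

    # GitHub contents-API URL, mirroring the owner/repo/branch/path parsing above
    parts = url.replace("https://github.com/", "").split("/")
    owner, repo, branch = parts[0], parts[1], parts[3]
    path = "/".join(parts[4:])
    print(f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}")
    # https://api.github.com/repos/JayWadekar/gwIAS-HM/contents/Pipeline/utils.py?ref=main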
urls.txt
CHANGED
@@ -16,4 +16,5 @@ https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_generato
 https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_params_O3a_HM.py
 https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggering_on_cluster.py
 https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggers_single_detector_HM.py
-https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Tutorial_notebooks/4.Trig_Coin_on_cluster.ipynb
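A minimal sketch (again not part of the commit) of how the updated load_docs() in rag.py is expected to dispatch the entries above, including the newly added tutorial notebook:

    def route(url):
        # Mirrors the dispatch order in load_docs() (sketch only, not the actual function)
        if url.endswith('.ipynb') and 'github.com' in url and '/blob/' in url:
            return "notebook: parsed cell by cell via the GitHub contents API"
        if url.endswith(('.py', '.md')) and 'github.com' in url and '/blob/' in url:
            return "raw file: fetched via github_to_raw() + WebBaseLoader"
        if '/tree/' in url and 'github.com' in url:
            return "directory: listed via the GitHub contents API"
        return "web page: loaded with the custom GitHubLoader"

    print(route("https://github.com/JayWadekar/gwIAS-HM/blob/main/Tutorial_notebooks/4.Trig_Coin_on_cluster.ipynb"))
    # notebook: parsed cell by cell via the GitHub contents API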