jaywadekar committed
Commit ba88389 · 1 Parent(s): 296dcac

Fixed urls import and reduced HTML boilerplate

Files changed (2)
  1. rag.py +254 -7
  2. urls.txt +2 -1
rag.py CHANGED
@@ -10,19 +10,266 @@ from langchain_core.output_parsers import StrOutputParser
  from langchain_core.runnables import RunnablePassthrough
  from langchain_text_splitters import RecursiveCharacterTextSplitter
  from langchain_community.document_loaders import WebBaseLoader
+ from langchain.schema import Document
+ import requests
+ import json
+ import base64
+ from bs4 import BeautifulSoup
+ import re
+
+ def github_to_raw(url):
+ """Convert GitHub URL to raw content URL"""
+ return url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")
+
+ def load_github_notebook(url):
+ """Load Jupyter notebook from GitHub URL using GitHub API"""
+ try:
+ # Convert GitHub blob URL to API URL
+ if "github.com" in url and "/blob/" in url:
+ # Extract owner, repo, branch and path from URL
+ parts = url.replace("https://github.com/", "").split("/")
+ owner = parts[0]
+ repo = parts[1]
+ branch = parts[3] # usually 'main' or 'master'
+ path = "/".join(parts[4:])
+
+ api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+ else:
+ raise ValueError("URL must be a GitHub blob URL")
+
+ # Fetch notebook content
+ response = requests.get(api_url)
+ response.raise_for_status()
+
+ content_data = response.json()
+ if content_data.get('encoding') == 'base64':
+ notebook_content = base64.b64decode(content_data['content']).decode('utf-8')
+ else:
+ notebook_content = content_data['content']
+
+ # Parse notebook JSON
+ notebook = json.loads(notebook_content)
+
+ docs = []
+ cell_count = 0
+
+ # Process each cell
+ for cell in notebook.get('cells', []):
+ cell_count += 1
+ cell_type = cell.get('cell_type', 'unknown')
+ source = cell.get('source', [])
+
+ # Join source lines
+ if isinstance(source, list):
+ content = ''.join(source)
+ else:
+ content = str(source)
+
+ if content.strip(): # Only add non-empty cells
+ metadata = {
+ 'source': url,
+ 'cell_type': cell_type,
+ 'cell_number': cell_count,
+ 'name': f"{url} - Cell {cell_count} ({cell_type})"
+ }
+
+ # Add cell type prefix for better context
+ formatted_content = f"[{cell_type.upper()} CELL {cell_count}]\n{content}"
+
+ docs.append(Document(page_content=formatted_content, metadata=metadata))
+
+ return docs
+
+ except Exception as e:
+ print(f"Error loading notebook from {url}: {str(e)}")
+ return []
+
+ def clean_text(text):
+ """Clean text content from a webpage"""
+ # Remove excessive newlines
+ text = re.sub(r'\n{3,}', '\n\n', text)
+ # Remove excessive whitespace
+ text = re.sub(r'\s{2,}', ' ', text)
+ return text.strip()
+
+ def clean_github_content(html_content):
+ """Extract meaningful content from GitHub pages"""
+ # Ensure we're working with a BeautifulSoup object
+ if isinstance(html_content, str):
+ soup = BeautifulSoup(html_content, 'html.parser')
+ else:
+ soup = html_content
+
+ # Remove navigation, footer, and other boilerplate
+ for element in soup.find_all(['nav', 'footer', 'header']):
+ element.decompose()
+
+ # For README and code files
+ readme_content = soup.find('article', class_='markdown-body')
+ if readme_content:
+ return clean_text(readme_content.get_text())
+
+ # For code files
+ code_content = soup.find('table', class_='highlight')
+ if code_content:
+ return clean_text(code_content.get_text())
+
+ # For directory listings
+ file_list = soup.find('div', role='grid')
+ if file_list:
+ return clean_text(file_list.get_text())
+
+ # Fallback to main content
+ main_content = soup.find('main')
+ if main_content:
+ return clean_text(main_content.get_text())
+
+ # If no specific content found, get text from body
+ body = soup.find('body')
+ if body:
+ return clean_text(body.get_text())
+
+ # Final fallback
+ return clean_text(soup.get_text())
+
+ class GitHubLoader(WebBaseLoader):
+ """Custom loader for GitHub pages with better content cleaning"""
+
+ def clean_text(self, text):
+ """Clean text content"""
+ # Remove excessive newlines and spaces
+ text = re.sub(r'\n{2,}', '\n', text)
+ text = re.sub(r'\s{2,}', ' ', text)
+ # Remove common GitHub boilerplate
+ text = re.sub(r'Skip to content|Sign in|Search or jump to|Footer navigation|Terms|Privacy|Security|Status|Docs', '', text)
+ return text.strip()
+
+ def _scrape(self, url: str, *args, **kwargs) -> str:
+ """Scrape data from URL and clean it.
+
+ Args:
+ url: The URL to scrape
+ *args: Additional positional arguments
+ **kwargs: Additional keyword arguments including bs_kwargs
+
+ Returns:
+ str: The cleaned content
+ """
+ response = requests.get(url)
+ response.raise_for_status()
+
+ # For directory listings (tree URLs), use the API
+ if '/tree/' in url:
+ # Parse URL components
+ parts = url.replace("https://github.com/", "").split("/")
+ owner = parts[0]
+ repo = parts[1]
+ branch = parts[3] # usually 'main' or 'master'
+ path = "/".join(parts[4:]) if len(parts) > 4 else ""
+
+ # Construct API URL
+ api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+ api_response = requests.get(api_url)
+ api_response.raise_for_status()
+
+ # Parse directory listing
+ contents = api_response.json()
+ if isinstance(contents, list):
+ # Format directory contents
+ files = [f"{item['name']} ({item['type']})" for item in contents]
+ return "Directory contents:\n" + "\n".join(files)
+ else:
+ return f"Error: Unexpected API response for {url}"
+
+ # For regular files, parse HTML
+ soup = BeautifulSoup(response.text, 'html.parser')
+
+ # For README and markdown files
+ readme_content = soup.find('article', class_='markdown-body')
+ if readme_content:
+ return self.clean_text(readme_content.get_text())
+
+ # For code files
+ code_content = soup.find('table', class_='highlight')
+ if code_content:
+ return self.clean_text(code_content.get_text())
+
+ # For other content, get main content
+ main_content = soup.find('main')
+ if main_content:
+ return self.clean_text(main_content.get_text())
+
+ # Final fallback
+ return self.clean_text(soup.get_text())

  # Load documentation from urls
  def load_docs():
-
  # Get urls
  urlsfile = open("urls.txt")
  urls = urlsfile.readlines()
  urls = [url.replace("\n","") for url in urls]
  urlsfile.close()

- # Load, chunk and index the contents of the blog.
- loader = WebBaseLoader(urls)
- docs = loader.load()
+ # Load documents from URLs
+ docs = []
+
+ for url in urls:
+ url = url.strip()
+ if not url:
+ continue
+
+ # Check if URL is a Jupyter notebook
+ if url.endswith('.ipynb') and 'github.com' in url and '/blob/' in url:
+ print(f"Loading notebook: {url}")
+ notebook_docs = load_github_notebook(url)
+ docs.extend(notebook_docs)
+ # Handle Python and Markdown files using raw content
+ elif url.endswith(('.py', '.md')) and 'github.com' in url and '/blob/' in url:
+ print(f"Loading raw content: {url}")
+ try:
+ raw_url = github_to_raw(url)
+ loader = WebBaseLoader([raw_url])
+ web_docs = loader.load()
+ # Preserve original URL in metadata
+ for doc in web_docs:
+ doc.metadata['source'] = url
+ docs.extend(web_docs)
+ except Exception as e:
+ print(f"Error loading {url}: {str(e)}")
+ # Handle directory listings
+ elif '/tree/' in url and 'github.com' in url:
+ print(f"Loading directory: {url}")
+ try:
+ # Parse URL components
+ parts = url.replace("https://github.com/", "").split("/")
+ owner = parts[0]
+ repo = parts[1]
+ branch = parts[3] # usually 'main' or 'master'
+ path = "/".join(parts[4:]) if len(parts) > 4 else ""
+
+ # Construct API URL
+ api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+ response = requests.get(api_url)
+ response.raise_for_status()
+
+ # Parse directory listing
+ contents = response.json()
+ if isinstance(contents, list):
+ # Format directory contents
+ content = "Directory contents:\n" + "\n".join([f"{item['name']} ({item['type']})" for item in contents])
+ docs.append(Document(page_content=content, metadata={'source': url}))
+ else:
+ print(f"Error: Unexpected API response for {url}")
+ except Exception as e:
+ print(f"Error loading directory {url}: {str(e)}")
+ else:
+ print(f"Loading web page: {url}")
+ try:
+ loader = GitHubLoader([url]) # Use custom loader
+ web_docs = loader.load()
+ docs.extend(web_docs)
+ except Exception as e:
+ print(f"Error loading {url}: {str(e)}")

  # Add source URLs as document names for reference
  for i, doc in enumerate(docs):
@@ -76,9 +323,9 @@ def RAG(llm, docs, embeddings):
  combined_template = "You are an assistant for question-answering tasks. "\
  + "Use the following pieces of retrieved context to answer the question. "\
  + "If you don't know the answer, just say that you don't know. "\
- + "Use six sentences maximum and keep the answer concise. "\
- + "Write the names of the relevant functions from the retrived code. "\
- + "Include the reference IDs in square brackets at the end of your answer."\
+ + "Try to keep the answer concise if possible. "\
+ + "Write the names of the relevant functions from the retrived code and include code snippets to aid the user's understanding. "\
+ + "Include the references used in square brackets at the end of your answer."\
  + template_parts[1]
  prompt.messages[0].prompt.template = combined_template
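
For quick reference, a minimal usage sketch of the helpers added above. It is illustrative only (not part of the commit) and assumes rag.py is importable as a module and that load_docs() still returns the assembled document list:

# Illustrative sketch: exercising the new GitHub loading helpers from rag.py.
from rag import github_to_raw, load_github_notebook, load_docs

# Blob URLs for .py/.md files are rewritten to raw.githubusercontent.com
# before being handed to WebBaseLoader.
print(github_to_raw("https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py"))
# -> https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/utils.py

# Notebook blob URLs are fetched via the GitHub contents API and split into
# one Document per non-empty cell, tagged with cell_type and cell_number.
cells = load_github_notebook(
    "https://github.com/JayWadekar/gwIAS-HM/blob/main/Tutorial_notebooks/4.Trig_Coin_on_cluster.ipynb")
print(len(cells), cells[0].metadata if cells else None)

# load_docs() reads urls.txt and dispatches each URL to the matching loader
# (notebook, raw .py/.md file, /tree/ directory listing, or GitHubLoader).
docs = load_docs()  # assumes load_docs() returns the document list, as before
print(f"Loaded {len(docs)} documents")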
 
urls.txt CHANGED
@@ -16,4 +16,5 @@ https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_generato
  https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_params_O3a_HM.py
  https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggering_on_cluster.py
  https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggers_single_detector_HM.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py
+ https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py
+ https://github.com/JayWadekar/gwIAS-HM/blob/main/Tutorial_notebooks/4.Trig_Coin_on_cluster.ipynb
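
Related sanity check (illustrative only, not part of the commit): before re-indexing, each entry in urls.txt, including the newly added tutorial notebook, can be checked for reachability:

# Illustrative sketch: verify that every URL listed in urls.txt still resolves.
import requests

with open("urls.txt") as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    try:
        status = requests.head(url, allow_redirects=True, timeout=10).status_code
        print(status, url)
    except requests.RequestException as err:
        print("ERROR", url, err)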