Petermoyano committed
Commit bc7569e · 1 Parent(s): 47f0ba6

Add scraper and update RAG

Files changed (2)
  1. app.py +58 -15
  2. scraper.py +81 -0
app.py CHANGED
@@ -7,40 +7,39 @@ from llama_index.llms.openai import OpenAI
 from llama_index.vector_stores.pinecone import PineconeVectorStore
 from llama_index.core import VectorStoreIndex, StorageContext
 from dotenv import load_dotenv
-from pinecone import Pinecone  # Correct import
+from pinecone import Pinecone
+from llama_index.embeddings.openai import OpenAIEmbedding
 
 load_dotenv()
 
 app = Flask(__name__)
 app.debug = True
 
-# Configure logging
 logging.basicConfig(level=logging.DEBUG)
 
-# Set up OpenAI API key
 openai.api_key = os.getenv('OPENAI_API_KEY')
 
-# Initialize Pinecone
 pc = Pinecone(
     api_key=os.getenv('PINECONE_API_KEY')
 )
 
-# Name of your existing Pinecone index
 PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX')
 
-# Initialize Pinecone index
 pinecone_index = pc.Index(PINECONE_INDEX_NAME)
 
+# After initializing the Pinecone index
+stats = pinecone_index.describe_index_stats()
+app.logger.debug(f"Pinecone index stats: {stats}")
+
 # Set up LlamaIndex global settings
 Settings.llm = OpenAI(
-    model=os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo'),  # Default to 'gpt-3.5-turbo' if not specified
+    model=os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo'),
     temperature=0
 )
 
-# Set up Pinecone Vector Store
 vector_store = PineconeVectorStore(
     pinecone_index=pinecone_index,
-    namespace=None  # Specify a namespace if used during ingestion
+    namespace="ai"
 )
 
 # Create Storage Context with the Vector Store
@@ -56,7 +55,7 @@ index = VectorStoreIndex.from_vector_store(
 def predict():
     try:
         data = request.json
-        app.logger.debug(f"Received data: {data}")
+        app.logger.debug(f"Received data: {data}")  # data => {'query': 'What is LangChain?'}
 
         if not data:
             app.logger.error("No data provided in the request.")
@@ -68,16 +67,60 @@ def predict():
             app.logger.error("No query provided in the request.")
             return jsonify({'error': 'No query provided.'}), 400
 
-        # Perform the query using LlamaIndex
-        response = index.as_query_engine().query(user_query)
-        # app.logger.debug(f"Generated response: {response}")
+        # Log Pinecone query details
+        app.logger.debug(f"Querying Pinecone index: {PINECONE_INDEX_NAME}")
+        app.logger.debug(f"Query: {user_query}")
 
-        return jsonify({'response': str(response)})
+        # Perform the query using LlamaIndex
+        query_engine = index.as_query_engine(similarity_top_k=5)
+        app.logger.debug(f"Query engine: {query_engine}")
+
+        response = query_engine.query(user_query)
+        app.logger.debug(f"Raw response object: {response}")
+        app.logger.debug(f"Response type: {type(response)}")
+
+        if hasattr(response, 'source_nodes'):
+            app.logger.debug(f"Number of source nodes: {len(response.source_nodes)}")
+            for i, node in enumerate(response.source_nodes):
+                app.logger.debug(f"Source node {i}: {node.node.text[:100]}...")  # Log first 100 chars of each source node
+        else:
+            app.logger.warning("No source nodes found in the response")
+
+        if hasattr(response, 'response'):
+            response_text = response.response
+        else:
+            response_text = str(response)
+
+        app.logger.debug(f"Response text: {response_text}")
+
+        return jsonify({'response': response_text})
 
     except Exception as e:
-        app.logger.error(f"Error processing request: {e}")
+        app.logger.error(f"Error processing request: {e}", exc_info=True)
        return jsonify({"error": "An error occurred while processing the request"}), 500
 
+@app.route('/empty-datastore', methods=['DELETE'])
+def empty_datastore():
+    try:
+        # Attempt to delete all vectors in the default namespace
+        delete_response = pinecone_index.delete(delete_all=True, namespace="")
+        app.logger.debug(f"Delete response: {delete_response}")
+
+        # Verify the index is empty
+        stats = pinecone_index.describe_index_stats()
+        app.logger.debug(f"Index stats after deletion: {stats}")
+
+        if stats['total_vector_count'] == 0:
+            app.logger.info("Datastore emptied successfully.")
+            return jsonify({'message': 'Datastore emptied successfully'}), 200
+        else:
+            app.logger.warning("Datastore not fully emptied.")
+            return jsonify({'message': 'Datastore not fully emptied'}), 500
+
+    except Exception as e:
+        app.logger.error(f"Error emptying datastore: {e}")
+        return jsonify({'error': f'An error occurred while emptying the datastore: {str(e)}'}), 500
+
 if __name__ == '__main__':
     from os import environ
     app.run(host='0.0.0.0', port=int(environ.get('PORT', 7860)))
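For quick manual testing of the updated app, a minimal client sketch is shown below. It assumes the Flask server is running locally on the default port 7860 from app.run, that predict() is mounted at a /predict route (the decorator is outside this diff), and that the request body uses the 'query' key seen in the logged example; the /empty-datastore route and its DELETE method come straight from the diff above.

import requests

BASE_URL = "http://localhost:7860"  # assumed host/port, matching app.run's PORT default

# Ask a question against the RAG endpoint (route path /predict is an assumption)
r = requests.post(f"{BASE_URL}/predict", json={"query": "What is LangChain?"})
print(r.status_code, r.json())

# Wipe the Pinecone datastore via the DELETE route added in this commit
r = requests.delete(f"{BASE_URL}/empty-datastore")
print(r.status_code, r.json())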
scraper.py ADDED
@@ -0,0 +1,81 @@
+import requests
+from bs4 import BeautifulSoup
+import os
+from urllib.parse import urljoin, urlparse, urldefrag
+import time
+import logging
+
+# Set up logging for error handling
+logging.basicConfig(filename='scrape_errors.log', level=logging.ERROR)
+
+def scrape_docs(base_url, save_dir, delay=1):
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+
+    visited = set()
+
+    def normalize_url(url):
+        # Remove fragments and query parameters, and normalize slashes
+        url, _ = urldefrag(url)  # Remove the fragment
+        parsed_url = urlparse(url)
+        normalized_url = parsed_url._replace(query="").geturl().rstrip('/')
+        return normalized_url
+
+    def scrape_page(url):
+        normalized_url = normalize_url(url)
+
+        if normalized_url in visited:
+            return
+        visited.add(normalized_url)
+
+        try:
+            response = requests.get(normalized_url)
+            if response.status_code != 200:
+                logging.error(f"Failed to retrieve {normalized_url}, status code: {response.status_code}")
+                return
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Save the page content
+            parsed_url = urlparse(normalized_url)
+            relative_path = parsed_url.path.lstrip('/')
+            file_path = os.path.join(save_dir, relative_path)
+
+            # Ensure the directory exists
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+            # Determine the file name: append 'index.html' if it's a directory
+            if parsed_url.path.endswith('/') or not os.path.basename(file_path):
+                file_path = os.path.join(file_path, 'index.html')
+            elif not file_path.endswith('.html'):
+                file_path += '.html'
+
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(response.text)
+
+            print(f"Scraped: {normalized_url}")
+
+            # Find all links on the page
+            for link in soup.find_all('a', href=True):
+                href = link['href']
+                full_url = urljoin(normalized_url, href)
+
+                # Only follow links within the base URL
+                if full_url.startswith(base_url):
+                    scrape_page(full_url)
+
+                # Respect server rate limits
+                time.sleep(delay)
+
+        except Exception as e:
+            logging.error(f"Failed to scrape {normalized_url}: {e}")
+            print(f"Failed to scrape {normalized_url}, see log for details.")
+
+    scrape_page(base_url)
+
+if __name__ == "__main__":
+    base_url = "https://docs.llamaindex.ai/en/stable/api_reference/"
+    save_dir = "llamaindex_docs"
+
+    scrape_docs(base_url, save_dir, delay=1)
+    print("Scraping completed.")