Galatea007 committed on
Commit
b8918cc
·
verified ·
1 Parent(s): f252750

Upload 2 files

Browse files
Files changed (2) hide show
  1. AI_Risk_app.py +182 -0
  2. requirements.txt +64 -0
AI_Risk_app.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import sys
4
+ from langchain_community.embeddings import OpenAIEmbeddings
5
+ from dotenv import load_dotenv
6
+
7
def install_packages():
    """Install the script's dependencies with pip, one batch at a time.

    Every batch is handed to its own ``pip install`` run under the current
    interpreter. A batch whose pip run exits with an error is reported on
    stdout and skipped; the remaining batches are still attempted.
    Returns ``None``.
    """
    pip_install = [sys.executable, "-m", "pip", "install"]

    # Batches are processed in this order. "--no-cache-dir" is a pip option
    # that rides along with the llama-index batch.
    batches = (
        ["langchain", "langchain-openai", "langchain_core", "langchain-community", "langchainhub", "openai", "langchain-qdrant"],
        ["qdrant-client", "pymupdf", "pandas"],
        ["llama-index", "--no-cache-dir"],
        ["llama-parse", "PyPDF2", "tiktoken"],
        ["langchain-text-splitters"],
        ["PyPDF2"],
        ["scikit-learn"],
    )

    for batch in batches:
        label = " ".join(batch)
        print(f"Installing: {label}")
        try:
            subprocess.check_call(pip_install + batch)
        except subprocess.CalledProcessError as err:
            # Best effort: report the failing batch and move on to the next.
            print(f"Failed to install {batch}: {err}\n")
        else:
            print(f"Successfully installed: {label}\n")
27
+
28
# Dependencies are installed only when this file is executed as a script.
if __name__ == "__main__":
    install_packages()

# Load variables from the .env file, then read the OpenAI key from the
# process environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Report whether a key was found; execution continues in either case.
print(
    "OpenAI API key loaded successfully."
    if api_key
    else "OpenAI API key not found. Please ensure it is set in the .env file."
)
43
+
44
+
45
+ import nest_asyncio
46
+ nest_asyncio.apply()
47
+
48
+ # Function to extract text from PDF URLs
49
+ import re
50
+ import requests
51
+ from PyPDF2 import PdfReader
52
+ from io import BytesIO
53
+
54
# Source documents, in the order they are downloaded and processed.
AI_BILL_OF_RIGHTS_URL = "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
NIST_AI_600_1_URL = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"

pdf_urls = [AI_BILL_OF_RIGHTS_URL, NIST_AI_600_1_URL]
59
+
60
def extract_text_from_pdf(url, timeout=30):
    """Download the PDF at *url* and return its text as a list of sentences.

    Args:
        url: HTTP(S) address of a PDF document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of sentence strings in document order. The list is empty
        when no text could be extracted from the PDF.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout a stalled server would hang the script, and without
    # the status check an error page would be handed to the PDF parser.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    reader = PdfReader(BytesIO(response.content))

    # Join pages with a space so the last word of one page is not fused
    # with the first word of the next; `or ""` guards against a page that
    # yields no text.
    pdf_text = " ".join(page.extract_text() or "" for page in reader.pages)

    # split() + join collapses every whitespace run, newlines included.
    cleaned_text = " ".join(pdf_text.split())

    # Break after sentence-ending punctuation and drop empty pieces so a
    # blank string is never passed on for embedding.
    return [piece for piece in re.split(r'(?<=[.!?]) +', cleaned_text) if piece]
74
+
75
# Download each PDF in turn, keeping one list of sentences per document and
# reporting the sentence count as soon as that document is done.
sentences_list = []
for pdf_url in pdf_urls:
    sentences_list.append(extract_text_from_pdf(pdf_url))
    print(f"Extracted {len(sentences_list[-1])} sentences from {pdf_url}")
81
+
82
+
83
+
84
+ # Semantic chunking
85
+ from langchain.embeddings.openai import OpenAIEmbeddings
86
+ from sklearn.metrics.pairwise import cosine_similarity
87
+ import tiktoken
88
+ import numpy as np
89
+
90
# Embedding client; the same object is reused later for the vector store.
embedding_model = OpenAIEmbeddings()

# Concatenate the per-document sentence lists, preserving document order,
# so that embeddings[i] corresponds to flat_sentences[i].
flat_sentences = []
for doc_sentences in sentences_list:
    flat_sentences.extend(doc_sentences)

embeddings = embedding_model.embed_documents(flat_sentences)
93
+
94
def greedy_chunk_sentences(sentences, sentence_embeddings, max_chunk_size=1000, similarity_threshold=0.75):
    """Group consecutive sentences into chunks of related text.

    Sentences are visited in order and appended to the chunk being built.
    Before a sentence is appended to a non-empty chunk, the chunk is closed
    if either of these holds:

    * the cosine similarity between this sentence's embedding and the
      previous sentence's embedding is below ``similarity_threshold``;
    * adding the sentence would push the chunk's token count (measured
      with the ``cl100k_base`` tiktoken encoding) above ``max_chunk_size``.

    Args:
        sentences: Sentence strings, in document order.
        sentence_embeddings: Embedding vectors indexed like ``sentences``.
        max_chunk_size: Token budget per chunk. A single sentence larger
            than the budget still becomes a chunk of its own.
        similarity_threshold: Minimum similarity to the previous sentence
            for a sentence to stay in the current chunk.

    Returns:
        A list of chunk strings, each the space-joined sentences of one
        chunk.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")

    finished = []
    pending = []
    pending_tokens = 0

    for idx, sentence in enumerate(sentences):
        n_tokens = len(tokenizer.encode(sentence))

        # A non-empty pending chunk implies idx >= 1, so idx - 1 always
        # refers to the sentence immediately before this one.
        if pending:
            score = cosine_similarity(
                [sentence_embeddings[idx]], [sentence_embeddings[idx - 1]]
            )[0][0]
            too_different = score < similarity_threshold
            too_long = pending_tokens + n_tokens > max_chunk_size
            if too_different or too_long:
                finished.append(" ".join(pending))
                pending = []
                pending_tokens = 0

        pending.append(sentence)
        pending_tokens += n_tokens

    # Flush whatever is left after the last sentence.
    if pending:
        finished.append(" ".join(pending))

    return finished
117
+
118
# Perform greedy chunking over every extracted sentence. `embeddings` was
# computed from `flat_sentences`, so the two sequences line up index for
# index; chunking only sentences_list[0] left the second PDF out of the
# vector store entirely.
semantic_chunks = greedy_chunk_sentences(flat_sentences, embeddings)
120
+
121
+
122
+ # Qdrant setup for storing chunks
123
+ from qdrant_client import QdrantClient
124
+ from qdrant_client.http.models import Distance, VectorParams
125
+ from langchain_qdrant import QdrantVectorStore
126
+ from langchain.schema import Document
127
+ import uuid
128
+
129
# In-memory Qdrant instance: nothing is persisted between runs.
LOCATION = ":memory:"
COLLECTION_NAME = "Semantic_Chunking"

qdrant_client = QdrantClient(LOCATION)

# NOTE(review): size=1536 has to equal the length of the vectors produced
# by embedding_model; confirm if the embedding model is ever changed.
vector_params = VectorParams(size=1536, distance=Distance.COSINE)
qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=vector_params,
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
    embedding=embedding_model,
)

# Wrap every chunk in a Document carrying a fresh random UUID, then store
# them all in one call.
documents = []
for chunk in semantic_chunks:
    documents.append(
        Document(
            page_content=chunk,
            metadata={"source": "generated"},
            id=str(uuid.uuid4()),
        )
    )
qdrant_vector_store.add_documents(documents)

# Retriever over the stored chunks, used by the RAG chain.
retriever = qdrant_vector_store.as_retriever()
150
+
151
+ # Define prompt and execute RAG chain
152
+ from langchain.prompts import ChatPromptTemplate
153
+ from operator import itemgetter
154
+ from langchain_openai import ChatOpenAI
155
+ from langchain_core.output_parsers import StrOutputParser
156
+ from langchain_core.runnables import RunnablePassthrough
157
+
158
template = """
### You are a helpful assistant. Use the available context to answer the question. If you can't answer the question, say you don't know.

Question:
{question}

Context:
{context}
"""

prompt = ChatPromptTemplate.from_template(template)
primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Stage 1: look up context for the question and carry the question along.
fetch_context = {"context": itemgetter("question") | retriever, "question": itemgetter("question")}

# Stage 2: pass the mapping through, re-assigning "context" to itself.
keep_context = RunnablePassthrough.assign(context=itemgetter("context"))

# Stage 3: produce the model response and return it next to the context
# it was given.
answer_with_context = {"response": prompt | primary_qa_llm, "context": itemgetter("context")}

retrieval_augmented_qa_chain = fetch_context | keep_context | answer_with_context

# Query the RAG chain and print only the text of the model's reply.
question = "What are the top AI risks and how to best manage them?"
result = retrieval_augmented_qa_chain.invoke({"question": question})

answer = result["response"]
print(answer.content)
requirements.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ chainlit==1.2.0
3
+ openai==1.47.0
4
+ langchain-openai>=0.1.6,<0.2.0 # Updated version range for compatibility
5
+ langchain-core>=0.1.46,<0.2.0 # Matches langchain-openai version requirements
6
+ langchain-community
7
+ langchainhub
8
+ langchain-qdrant
9
+ streamlit
10
+ python-dotenv
11
+ langchain
12
+ openai
13
+ streamlit
14
+ python-dotenv
15
+
16
+
17
+
18
+ # Llama-index and related libraries
19
+ llama-index==0.11.11
20
+ llama-index-agent-openai==0.3.4
21
+ llama-index-cli==0.3.1
22
+ llama-index-core==0.11.11
23
+ llama-index-embeddings-openai==0.2.5
24
+ llama-index-indices-managed-llama-cloud==0.3.1
25
+ llama-index-legacy==0.9.48.post3
26
+ llama-index-llms-openai==0.2.9
27
+ llama-index-multi-modal-llms-openai==0.2.1
28
+ llama-index-program-openai==0.2.0
29
+ llama-index-question-gen-openai==0.2.0
30
+ llama-index-readers-file==0.2.2
31
+ llama-index-readers-llama-parse==0.3.0
32
+ llama-parse==0.5.6
33
+
34
+ # Llama Cloud
35
+ llama-cloud==0.0.17
36
+
37
+ # Additional libraries
38
+ qdrant-client # Ensure no conflicts with qdrant
39
+ pymupdf # Ensure compatibility
40
+ pandas==2.2.3 # Latest stable version
41
+ scikit-learn==1.5.2 # Latest available version
42
+ PyPDF2==3.0.1 # Fixed to avoid version conflict
43
+
44
+ # Tiktoken version (Updated to match langchain-openai's requirements)
45
+ tiktoken>=0.5.2,<0.6.0
46
+
47
+ # Dependency version conflict fix
48
+ packaging>=23.1,<24.0 # Pin packaging to avoid conflicts
49
+
50
+ # NetworkX and mypy-extensions (pinned to avoid pip backtracking)
51
+ networkx==3.2 # Pinned for compatibility
52
+ mypy-extensions==0.4.3
53
+
54
+ # Other dependencies
55
+ SQLAlchemy>=1.4.49 # Ensure compatibility
56
+ aiohttp>=3.8.6 # Compatible version
57
+ dataclasses-json>=0.6.7 # Ensure no conflicts with other libraries
58
+ fsspec>=2023.5.0 # Minimum version for compatibility
59
+ nltk>3.8.1 # Any release newer than 3.8.1
60
+ requests>=2.31.0 # Minimum version (lower bound, not a pin)
61
+ tqdm>=4.66.1 # Ensure compatibility
62
+ jsonpointer==2.4
63
+ importlib-metadata>=6.0,<=8.0.0
64
+ opentelemetry-api==1.26.0