rumaisa1054 committed on
Commit
93f4205
·
verified ·
1 Parent(s): 006d13e

Upload genaisession.ipynb

Browse files
Files changed (1) hide show
  1. genaisession.ipynb +267 -0
genaisession.ipynb ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "source": [
20
+ "pip install faiss-cpu numpy pypdf sentence-transformers\n"
21
+ ],
22
+ "metadata": {
23
+ "colab": {
24
+ "base_uri": "https://localhost:8080/"
25
+ },
26
+ "id": "LqTTG2cy0L1A",
27
+ "outputId": "c8be3a59-e763-47a7-f1de-4a010dae06f4"
28
+ },
29
+ "execution_count": null,
30
+ "outputs": [
31
+ {
32
+ "output_type": "stream",
33
+ "name": "stdout",
34
+ "text": [
35
+ "Requirement already satisfied: faiss-cpu in /usr/local/lib/python3.11/dist-packages (1.10.0)\n",
36
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (1.26.4)\n",
37
+ "Requirement already satisfied: pypdf in /usr/local/lib/python3.11/dist-packages (5.3.0)\n",
38
+ "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.11/dist-packages (3.4.1)\n",
39
+ "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from faiss-cpu) (24.2)\n",
40
+ "Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (4.48.3)\n",
41
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (4.67.1)\n",
42
+ "Requirement already satisfied: torch>=1.11.0 in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (2.5.1+cu124)\n",
43
+ "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (1.6.1)\n",
44
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (1.13.1)\n",
45
+ "Requirement already satisfied: huggingface-hub>=0.20.0 in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (0.28.1)\n",
46
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (11.1.0)\n",
47
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.17.0)\n",
48
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2024.10.0)\n",
49
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.2)\n",
50
+ "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)\n",
51
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (4.12.2)\n",
52
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (3.4.2)\n",
53
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (3.1.5)\n",
54
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
55
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
56
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
57
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (9.1.0.70)\n",
58
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (12.4.5.8)\n",
59
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (11.2.1.3)\n",
60
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (10.3.5.147)\n",
61
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (11.6.1.9)\n",
62
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (12.3.1.170)\n",
63
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (2.21.5)\n",
64
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
65
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
66
+ "Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (3.1.0)\n",
67
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch>=1.11.0->sentence-transformers) (1.13.1)\n",
68
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch>=1.11.0->sentence-transformers) (1.3.0)\n",
69
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2024.11.6)\n",
70
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.21.0)\n",
71
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2)\n",
72
+ "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn->sentence-transformers) (1.4.2)\n",
73
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn->sentence-transformers) (3.5.0)\n",
74
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch>=1.11.0->sentence-transformers) (3.0.2)\n",
75
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.4.1)\n",
76
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10)\n",
77
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.3.0)\n",
78
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.1.31)\n"
79
+ ]
80
+ }
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": null,
86
+ "metadata": {
87
+ "colab": {
88
+ "base_uri": "https://localhost:8080/"
89
+ },
90
+ "id": "V2T0CkLD0Cnh",
91
+ "outputId": "176443e5-f99f-4d65-c6c5-e1ca43699006"
92
+ },
93
+ "outputs": [
94
+ {
95
+ "output_type": "stream",
96
+ "name": "stdout",
97
+ "text": [
98
+ "Extracting text from PDF...\n",
99
+ "Extracted text (first 500 chars): Machine Learning For Absolute\n",
100
+ "Beginners\n",
101
+ " \n",
102
+ " \n",
103
+ " \n",
104
+ " \n",
105
+ "Oliver Theobald\n",
106
+ " \n",
107
+ " \n",
108
+ " \n",
109
+ " \n",
110
+ " \n",
111
+ "Second Edition\n",
112
+ "Copyright © 2017 by Oliver Theobald\n",
113
+ "All rights reserved. No part of this publication may be reproduced,\n",
114
+ "distributed, or transmitted in any form or by any means, including\n",
115
+ "photocopying, recording, or other electronic or mechanical\n",
116
+ "methods, without the prior written permission of the publisher,\n",
117
+ "except in the case of brief quotations embodied in critical reviews\n",
118
+ "and certain other non-commercial uses permitted b\n",
119
+ "Chunking text...\n",
120
+ "Total chunks created: 53\n",
121
+ "Generating embeddings...\n",
122
+ "Embedding 1/53 generated, Shape: (1, 384)\n",
123
+ "Embedding 2/53 generated, Shape: (1, 384)\n",
124
+ "Embedding 3/53 generated, Shape: (1, 384)\n",
125
+ "Embedding 4/53 generated, Shape: (1, 384)\n",
126
+ "Embedding 5/53 generated, Shape: (1, 384)\n",
127
+ "Embedding 6/53 generated, Shape: (1, 384)\n",
128
+ "Embedding 7/53 generated, Shape: (1, 384)\n",
129
+ "Embedding 8/53 generated, Shape: (1, 384)\n",
130
+ "Embedding 9/53 generated, Shape: (1, 384)\n",
131
+ "Embedding 10/53 generated, Shape: (1, 384)\n",
132
+ "Embedding 11/53 generated, Shape: (1, 384)\n",
133
+ "Embedding 12/53 generated, Shape: (1, 384)\n",
134
+ "Embedding 13/53 generated, Shape: (1, 384)\n",
135
+ "Embedding 14/53 generated, Shape: (1, 384)\n",
136
+ "Embedding 15/53 generated, Shape: (1, 384)\n",
137
+ "Embedding 16/53 generated, Shape: (1, 384)\n",
138
+ "Embedding 17/53 generated, Shape: (1, 384)\n",
139
+ "Embedding 18/53 generated, Shape: (1, 384)\n",
140
+ "Embedding 19/53 generated, Shape: (1, 384)\n",
141
+ "Embedding 20/53 generated, Shape: (1, 384)\n",
142
+ "Embedding 21/53 generated, Shape: (1, 384)\n",
143
+ "Embedding 22/53 generated, Shape: (1, 384)\n",
144
+ "Embedding 23/53 generated, Shape: (1, 384)\n",
145
+ "Embedding 24/53 generated, Shape: (1, 384)\n",
146
+ "Embedding 25/53 generated, Shape: (1, 384)\n",
147
+ "Embedding 26/53 generated, Shape: (1, 384)\n",
148
+ "Embedding 27/53 generated, Shape: (1, 384)\n",
149
+ "Embedding 28/53 generated, Shape: (1, 384)\n",
150
+ "Embedding 29/53 generated, Shape: (1, 384)\n",
151
+ "Embedding 30/53 generated, Shape: (1, 384)\n",
152
+ "Embedding 31/53 generated, Shape: (1, 384)\n",
153
+ "Embedding 32/53 generated, Shape: (1, 384)\n",
154
+ "Embedding 33/53 generated, Shape: (1, 384)\n",
155
+ "Embedding 34/53 generated, Shape: (1, 384)\n",
156
+ "Embedding 35/53 generated, Shape: (1, 384)\n",
157
+ "Embedding 36/53 generated, Shape: (1, 384)\n",
158
+ "Embedding 37/53 generated, Shape: (1, 384)\n",
159
+ "Embedding 38/53 generated, Shape: (1, 384)\n",
160
+ "Embedding 39/53 generated, Shape: (1, 384)\n",
161
+ "Embedding 40/53 generated, Shape: (1, 384)\n",
162
+ "Embedding 41/53 generated, Shape: (1, 384)\n",
163
+ "Embedding 42/53 generated, Shape: (1, 384)\n",
164
+ "Embedding 43/53 generated, Shape: (1, 384)\n",
165
+ "Embedding 44/53 generated, Shape: (1, 384)\n",
166
+ "Embedding 45/53 generated, Shape: (1, 384)\n",
167
+ "Embedding 46/53 generated, Shape: (1, 384)\n",
168
+ "Embedding 47/53 generated, Shape: (1, 384)\n",
169
+ "Embedding 48/53 generated, Shape: (1, 384)\n",
170
+ "Embedding 49/53 generated, Shape: (1, 384)\n",
171
+ "Embedding 50/53 generated, Shape: (1, 384)\n",
172
+ "Embedding 51/53 generated, Shape: (1, 384)\n",
173
+ "Embedding 52/53 generated, Shape: (1, 384)\n",
174
+ "Embedding 53/53 generated, Shape: (1, 384)\n",
175
+ "Storing in FAISS...\n",
176
+ "FAISS database saved as 'vector_database.faiss'\n"
177
+ ]
178
+ }
179
+ ],
180
+ "source": [
181
+ "import os\n",
182
+ "import faiss\n",
183
+ "import numpy as np\n",
184
+ "import pypdf # Using pypdf for text extraction\n",
185
+ "from sentence_transformers import SentenceTransformer\n",
186
+ "\n",
187
+ "# Load an open-source embedding model from Hugging Face\n",
188
+ "model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")\n",
189
+ "\n",
190
+ "# Load text from PDF using pypdf\n",
191
+ "def load_pdf(pdf_path):\n",
192
+ " text = \"\"\n",
193
+ " with open(pdf_path, \"rb\") as file:\n",
194
+ " reader = pypdf.PdfReader(file)\n",
195
+ " for page in reader.pages:\n",
196
+ " text += page.extract_text() + \"\\n\" if page.extract_text() else \"\" # Handle empty pages\n",
197
+ " return text.strip() if text.strip() else None # Ensure non-empty text\n",
198
+ "\n",
199
+ "# Split text into chunks\n",
200
+ "def chunk_text(text, chunk_size=500):\n",
201
+ " words = text.split()\n",
202
+ " chunks = [\" \".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]\n",
203
+ " return [c for c in chunks if c.strip()] # Remove empty chunks\n",
204
+ "\n",
205
+ "# Generate embeddings using Hugging Face model\n",
206
+ "def get_embedding(text):\n",
207
+ " return model.encode(text, convert_to_numpy=True).reshape(1, -1) # Ensure 2D shape\n",
208
+ "\n",
209
+ "# Store embeddings in FAISS\n",
210
+ "def store_in_faiss(embeddings):\n",
211
+ " if len(embeddings) == 0:\n",
212
+ " raise ValueError(\"No embeddings found! Check your text extraction and chunking.\")\n",
213
+ "\n",
214
+ " embeddings = np.vstack(embeddings) # Stack into 2D array\n",
215
+ " dim = embeddings.shape[1]\n",
216
+ " index = faiss.IndexFlatL2(dim)\n",
217
+ " index.add(embeddings)\n",
218
+ " faiss.write_index(index, \"vector_database.faiss\")\n",
219
+ "\n",
220
+ "def main():\n",
221
+ " pdf_path = \"/content/[Oliver_Theobald]_Machine_Learning_for_Absolute_Be.pdf\"\n",
222
+ "\n",
223
+ " print(\"Extracting text from PDF...\")\n",
224
+ " text = load_pdf(pdf_path)\n",
225
+ " if text is None:\n",
226
+ " raise ValueError(\"No text extracted from PDF. Check if it's a scanned document!\")\n",
227
+ "\n",
228
+ " print(\"Extracted text (first 500 chars):\", text[:500])\n",
229
+ "\n",
230
+ " print(\"Chunking text...\")\n",
231
+ " chunks = chunk_text(text)\n",
232
+ " print(f\"Total chunks created: {len(chunks)}\")\n",
233
+ " if not chunks:\n",
234
+ " raise ValueError(\"No valid text chunks found!\")\n",
235
+ "\n",
236
+ " print(\"Generating embeddings...\")\n",
237
+ " embeddings = []\n",
238
+ " for i, chunk in enumerate(chunks):\n",
239
+ " emb = get_embedding(chunk)\n",
240
+ " print(f\"Embedding {i+1}/{len(chunks)} generated, Shape: {emb.shape}\")\n",
241
+ " embeddings.append(emb)\n",
242
+ "\n",
243
+ " if not embeddings:\n",
244
+ " raise ValueError(\"No embeddings were generated! Check the text chunks.\")\n",
245
+ "\n",
246
+ " embeddings = np.vstack(embeddings)\n",
247
+ "\n",
248
+ " print(\"Storing in FAISS...\")\n",
249
+ " store_in_faiss(embeddings)\n",
250
+ "\n",
251
+ " print(\"FAISS database saved as 'vector_database.faiss'\")\n",
252
+ "\n",
253
+ "if __name__ == \"__main__\":\n",
254
+ " main()"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "source": [],
260
+ "metadata": {
261
+ "id": "4FyvMg221DIg"
262
+ },
263
+ "execution_count": null,
264
+ "outputs": []
265
+ }
266
+ ]
267
+ }