In [None]:
pip install faiss-cpu numpy pypdf sentence-transformers




In [None]:
import os
import faiss
import numpy as np
import pypdf # Using pypdf for text extraction
from sentence_transformers import SentenceTransformer

# Shared open-source sentence-embedding model (from Hugging Face).
# get_embedding() encodes text with this module-level instance.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
)

def load_pdf(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined by newlines with surrounding whitespace
        stripped, or ``None`` when no text could be extracted at all.
    """
    with open(pdf_path, "rb") as file:
        reader = pypdf.PdfReader(file)
        # Extract each page exactly once (the extraction is the expensive
        # part) while the file handle is still open.
        page_texts = [page.extract_text() for page in reader.pages]

    # Pages that yielded nothing (empty string / None) are skipped.
    text = "\n".join(t for t in page_texts if t).strip()
    return text or None

def chunk_text(text, chunk_size=500, overlap=0):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The text to split. Any run of whitespace separates words.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks. The
            default of 0 produces disjoint chunks.

    Returns:
        A list of chunk strings with words joined by single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            in the range ``0 <= overlap < chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap!r}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        # str.split() never yields empty words, so every chunk is non-empty.
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the last word; with overlap > 0 any
        # further chunk would only repeat the tail of this one.
        if start + chunk_size >= len(words):
            break
    return chunks

def get_embedding(text):
    """Embed text with the module-level ``model`` and return a 2D array.

    Args:
        text: A single string, or a list of strings to embed together.

    Returns:
        A NumPy array with one row per input text: shape ``(1, dim)`` for a
        single string and ``(n, dim)`` for a list of ``n`` strings.
    """
    vectors = model.encode(text, convert_to_numpy=True)
    # atleast_2d promotes a lone 1D vector to a single row but leaves a batch
    # untouched; reshape(1, -1) would flatten a batch into one long row.
    return np.atleast_2d(vectors)

def store_in_faiss(embeddings, index_path="vector_database.faiss"):
    """Build a flat L2 FAISS index from embeddings and write it to disk.

    Args:
        embeddings: A 2D array, or a sequence of arrays that ``np.vstack``
            can stack into one, holding one embedding per row.
        index_path: File the index is written to.

    Raises:
        ValueError: If ``embeddings`` is empty.
    """
    if len(embeddings) == 0:
        raise ValueError("No embeddings found! Check your text extraction and chunking.")

    # FAISS only accepts C-contiguous float32 matrices; coerce here so that
    # float64 or non-contiguous input does not fail inside index.add().
    matrix = np.ascontiguousarray(np.vstack(embeddings), dtype=np.float32)
    dim = matrix.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(matrix)
    faiss.write_index(index, index_path)

def main(pdf_path="/content/[Oliver_Theobald]_Machine_Learning_for_Absolute_Be.pdf"):
    """Build a FAISS index from the text of one PDF.

    Pipeline: extract the PDF text, split it into word chunks, embed each
    chunk, then hand the embeddings to ``store_in_faiss`` to write the index.

    Args:
        pdf_path: PDF to index. The default is the path this notebook was
            originally run against.

    Raises:
        ValueError: If no text can be extracted or no chunks are produced.
    """
    print("Extracting text from PDF...")
    text = load_pdf(pdf_path)
    if text is None:
        raise ValueError("No text extracted from PDF. Check if it's a scanned document!")

    print("Extracted text (first 500 chars):", text[:500])

    print("Chunking text...")
    # NOTE(review): chunk_text defaults to 500-word chunks, while the
    # all-MiniLM-L6-v2 model card reports truncation at 256 word pieces, so
    # the tail of each chunk is probably not embedded — confirm and consider
    # passing a smaller chunk_size.
    chunks = chunk_text(text)
    print(f"Total chunks created: {len(chunks)}")
    if not chunks:
        raise ValueError("No valid text chunks found!")

    print("Generating embeddings...")
    embeddings = []
    for position, chunk in enumerate(chunks, start=1):
        emb = get_embedding(chunk)
        print(f"Embedding {position}/{len(chunks)} generated, Shape: {emb.shape}")
        embeddings.append(emb)

    # chunks is non-empty here, so embeddings has at least one entry;
    # store_in_faiss stacks the list into a single 2D array itself.
    print("Storing in FAISS...")
    store_in_faiss(embeddings)

    print("FAISS database saved as 'vector_database.faiss'")


if __name__ == "__main__":
    main()

Extracting text from PDF...
Extracted text (first 500 chars): Machine Learning For Absolute
Beginners
 
 
 
 
Oliver Theobald
 
 
 
 
 
Second Edition
Copyright © 2017 by Oliver Theobald
All rights reserved. No part of this publication may be reproduced,
distributed, or transmitted in any form or by any means, including
photocopying, recording, or other electronic or mechanical
methods, without the prior written permission of the publisher,
except in the case of brief quotations embodied in critical reviews
and certain other non-commercial uses permitted b
Chunking text...
Total chunks created: 53
Generating embeddings...
Embedding 1/53 generated, Shape: (1, 384)
Embedding 2/53 generated, Shape: (1, 384)
Embedding 3/53 generated, Shape: (1, 384)
Embedding 4/53 generated, Shape: (1, 384)
Embedding 5/53 generated, Shape: (1, 384)
Embedding 6/53 generated, Shape: (1, 384)
Embedding 7/53 generated, Shape: (1, 384)
Embedding 8/53 generated, Shape: (1, 384)
Embedding 9/53 generated, Shape: (