|
from datetime import date, timedelta

import bs4
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import config
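
# config.py is assumed to look roughly like this (values are illustrative):
#   GOOGLE_EMBEDDING_MODEL = "models/embedding-001"
#   STORAGE_PATH = "./storage/"   # trailing slash matters: paths below are built by concatenation
#   NUM_DAYS_PAST = 5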
|
|
|
DATA_URL = "https://www.sikafinance.com/marches/actualites_bourse_brvm"

embeddings_model = GoogleGenerativeAIEmbeddings(
    model=config.GOOGLE_EMBEDDING_MODEL
)

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)
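
# The flags above let Chrome run headless (no display), which is what you want on a
# server or inside a container. A single module-level driver is shared by
# scrap_articles() and closed in the __main__ block once ingestion is done.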
|
|
|
|
|
def scrap_articles(
    url="https://www.sikafinance.com/marches/actualites_bourse_brvm", num_days_past=5
):
    """
    Scrape the title, date and link of the articles published over the last `num_days_past` days.
    """
    today = date.today()
    driver.get(url)

    all_articles = []
    for i in range(num_days_past + 1):
        past_date = today - timedelta(days=i)
        date_str = past_date.strftime("%Y-%m-%d")

        # Fill in the date filter and submit it to load that day's articles.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "dateActu"))
        )
        text_box = driver.find_element(By.ID, "dateActu")
        text_box.clear()  # remove the date typed on the previous iteration
        text_box.send_keys(date_str)

        submit_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "btn"))
        )
        submit_btn.click()

        # The results table lists one <a> per article; the matching dates carry the "sp1" class.
        dates = driver.find_elements(By.CLASS_NAME, "sp1")
        table = driver.find_element(By.ID, "tabQuotes")
        titles = table.find_elements(By.TAG_NAME, "a")

        articles = []
        for title, art_date in zip(titles, dates):
            articles.append(
                {
                    "title": title.text.strip(),
                    "date": art_date.text,
                    "link": title.get_attribute("href"),
                }
            )
        all_articles += articles

    return all_articles
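
# scrap_articles() returns a list of dicts shaped like (values illustrative):
#   [{"title": "...", "date": "...", "link": "https://www.sikafinance.com/..."}]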
|
|
|
|
|
def set_metadata(documents, metadatas):
    """
    Overwrite the metadata of each LangChain Document with the matching entry of `metadatas`.
    """
    for idx, doc in enumerate(documents):
        doc.metadata = metadatas[idx]
    print("Metadata successfully changed")
    print(documents[0].metadata)
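
# Note: set_metadata pairs documents and metadata by position, so callers must pass
# `metadatas` in the same order as the documents they describe. process_docs below
# relies on WebBaseLoader yielding one document per entry of `web_paths`, in order.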
|
|
|
|
|
def process_docs(
    articles, persist_directory, embeddings_model, chunk_size=500, chunk_overlap=0
):
    """
    Scrape the content of every article URL and persist it in a vector DB.
    """
    article_urls = [a["link"] for a in articles]

    print("Starting to scrape ..")

    # Only parse the article body, signature and header blocks of each page.
    loader = WebBaseLoader(
        web_paths=article_urls,
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("inarticle txtbig", "dt_sign", "innerUp")
            )
        ),
    )

    print("Fetching and parsing article pages ..")
    docs = loader.load()

    set_metadata(documents=docs, metadatas=articles)

    # Child chunks are what actually gets embedded and searched.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n"]
    )

    vectorstore = Chroma(
        persist_directory=persist_directory + "vectorstore",
        collection_name="full_documents",
        embedding_function=embeddings_model,
    )

    # Full parent documents are kept in a key-value docstore on disk.
    fs = LocalFileStore(persist_directory + "docstore")
    store = create_kv_docstore(fs)

    # ParentDocumentRetriever indexes the child chunks in the vectorstore but returns
    # the full parent documents stored in the docstore.
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
    )

    retriever.add_documents(docs, ids=None)
    print(len(docs), "documents added")
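
    # At query time, the same persist_directory can be reopened with identical Chroma /
    # LocalFileStore settings and wrapped in a ParentDocumentRetriever again; retrieval
    # is then e.g. retriever.get_relevant_documents("BRVM") (sketch, not run here).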
|
|
|
|
|
if __name__ == "__main__":
    data = scrap_articles(DATA_URL, num_days_past=config.NUM_DAYS_PAST)
    process_docs(data, config.STORAGE_PATH, embeddings_model)
    driver.quit()  # release the browser once ingestion is finished
|
|