finchat / scrape_data.py
from datetime import date, timedelta
import bs4
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import config
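# `config` is the project-local settings module. A minimal sketch of what it is
# assumed to contain, based on the attributes used in this file (the values are
# placeholders, not the project's actual configuration):
#
#     GOOGLE_EMBEDDING_MODEL = "models/embedding-001"  # Google embedding model name
#     NUM_DAYS_PAST = 5                                # how many days of news to scrape
#     STORAGE_PATH = "data/"                           # base path for vectorstore/docstore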
DATA_URL = "https://www.sikafinance.com/marches/actualites_bourse_brvm"
embeddings_model = GoogleGenerativeAIEmbeddings(
model=config.GOOGLE_EMBEDDING_MODEL
) # type: ignore
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)
def scrap_articles(url=DATA_URL, num_days_past=5):
    """
    Scrape the title, date and link of BRVM news articles published over the
    last `num_days_past` days.
    """
    today = date.today()
    driver.get(url)
    all_articles = []
    for day_offset in range(num_days_past + 1):
        past_date = today - timedelta(days=day_offset)
        date_str = past_date.strftime("%Y-%m-%d")
        # Fill in the date filter and submit the search form
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "dateActu"))
        )
        text_box = driver.find_element(By.ID, "dateActu")
        text_box.clear()  # drop the date left over from the previous iteration
        text_box.send_keys(date_str)
        submit_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "btn"))
        )
        submit_btn.click()
        # Wait for the results table before reading it
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "tabQuotes"))
        )
        dates = driver.find_elements(By.CLASS_NAME, "sp1")
        table = driver.find_element(By.ID, "tabQuotes")
        titles = table.find_elements(By.TAG_NAME, "a")
        # Pair each title with its date; zip() stops at the shorter list
        articles = [
            {
                "title": title.text.strip(),
                "date": date_el.text,
                "link": title.get_attribute("href"),
            }
            for title, date_el in zip(titles, dates)
        ]
        all_articles += articles
# driver.quit()
return all_articles
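
# Example of the structure returned by scrap_articles(); the values below are
# purely illustrative, not real site data:
#
#     [
#         {
#             "title": "BRVM: weekly market review",
#             "date": "date string as displayed on the page",
#             "link": "https://www.sikafinance.com/marches/...",
#         },
#         ...
#     ]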
def set_metadata(documents, metadatas):
"""
#Edit a metadata of lanchain Documents object
"""
for doc in documents:
idx = documents.index(doc)
doc.metadata = metadatas[idx]
print("Metadata successfully changed")
print(documents[0].metadata)
def process_docs(
articles, persist_directory, embeddings_model, chunk_size=500, chunk_overlap=0
):
"""
#Scrap all articles urls content and save on a vector DB
"""
article_urls = [a["link"] for a in articles]
print("Starting to scrap ..")
loader = WebBaseLoader(
web_paths=article_urls,
bs_kwargs=dict(
parse_only=bs4.SoupStrainer(
class_=("inarticle txtbig", "dt_sign", "innerUp")
)
),
)
print("After scraping Loading ..")
docs = loader.load()
    # Update metadata: attach the scraped title, date and link to each document
set_metadata(documents=docs, metadatas=articles)
# print("Successfully loaded to document")
# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n"]
)
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
persist_directory=persist_directory + "vectorstore",
collection_name="full_documents",
embedding_function=embeddings_model,
)
# The storage layer for the parent documents
fs = LocalFileStore(persist_directory + "docstore")
store = create_kv_docstore(fs)
retriever = ParentDocumentRetriever(
vectorstore=vectorstore,
docstore=store,
child_splitter=child_splitter,
)
retriever.add_documents(docs, ids=None)
print(len(docs), " documents added")
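
# Illustrative sketch (not called by this script): the persisted vectorstore and
# docstore written by process_docs() can be reopened later with the same settings
# to query the indexed articles. `load_retriever` is a hypothetical helper, not
# part of the original project.
def load_retriever(persist_directory, embeddings_model):
    vectorstore = Chroma(
        persist_directory=persist_directory + "vectorstore",
        collection_name="full_documents",
        embedding_function=embeddings_model,
    )
    store = create_kv_docstore(LocalFileStore(persist_directory + "docstore"))
    # The retriever requires a child splitter; settings consistent with
    # indexing time are assumed here.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=0, separators=["\n"]
    )
    return ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
    )
# Example query (illustrative):
#     retriever = load_retriever(config.STORAGE_PATH, embeddings_model)
#     docs = retriever.get_relevant_documents("BRVM market news")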
if __name__ == "__main__":
    data = scrap_articles(DATA_URL, num_days_past=config.NUM_DAYS_PAST)
    process_docs(data, config.STORAGE_PATH, embeddings_model)
    driver.quit()  # release the Selenium session once indexing is done