# Hugging Face Spaces status when this file was captured: Sleeping
import os
import re

import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_groq import ChatGroq
from pytubefix import YouTube
from qdrant_client import QdrantClient
# Function to extract the transcript text
def _srt_to_text(srt):
    """Return the caption text of an SRT document as one space-joined string.

    An SRT document is a sequence of cues separated by blank lines. Each cue
    is a counter line, a ``start --> end`` timestamp line, and then one or
    more lines of text. Everything after the timestamp line of a cue is
    treated as text, so multi-line cues are handled correctly.
    """
    pieces = []
    for cue in re.split(r"\n\s*\n", srt.strip()):
        cue_lines = cue.splitlines()
        for pos, line in enumerate(cue_lines):
            if "-->" in line:
                # Text starts right after the timestamp line.
                pieces.extend(
                    part.strip() for part in cue_lines[pos + 1:] if part.strip()
                )
                break
    return " ".join(pieces)


def get_text(video_id):
    """Fetch the English captions of a YouTube video as plain text.

    Args:
        video_id: Value passed straight to ``YouTube(...)``; the Gradio UI
            in this file supplies the full video URL.

    Returns:
        The caption text of every cue joined with single spaces.

    Raises:
        ValueError: If the video has neither an ``en`` nor an ``a.en``
            (auto-generated English) caption track.
    """
    yt = YouTube(video_id)

    caption = None
    # Prefer the uploaded English track; fall back to the auto-generated one.
    for lang_code in ("en", "a.en"):
        caption = yt.captions.get_by_language_code(lang_code)
        if caption is not None:
            break
    if caption is None:
        raise ValueError("No English captions are available for this video")

    return _srt_to_text(caption.generate_srt_captions())
# Function to create the Qdrant database
def create_qdrant_database(url):
    """Index the transcript of a YouTube video into the Qdrant collection.

    The transcript is split into overlapping chunks, embedded with the
    ``BAAI/bge-large-en`` model on CPU, and uploaded to the
    ``Youtube_Videos`` collection.

    Configuration is read from the environment:
        QDRANT_API_KEY: required; API key for the Qdrant cluster.
        QDRANT_URL: optional; overrides the default cluster URL.

    Args:
        url: YouTube video URL whose English captions should be indexed.

    Returns:
        A short status message suitable for display in the UI.

    Raises:
        RuntimeError: If ``QDRANT_API_KEY`` is not set.
    """
    # Credentials must never live in source control; fail early and clearly
    # before doing any expensive work (transcript download, model load).
    api_key = os.environ.get("QDRANT_API_KEY")
    if not api_key:
        raise RuntimeError("QDRANT_API_KEY environment variable is not set")
    qdrant_url = os.environ.get(
        "QDRANT_URL",
        "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333",
    )

    text = get_text(url)
    if not text.strip():
        # Nothing to embed; avoid sending an empty batch to the vector store.
        return "No transcript text found for this video"

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
    )
    docs = text_splitter.split_text(text)

    embeddings = HuggingFaceBgeEmbeddings(
        model_name='BAAI/bge-large-en',
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )

    # NOTE(review): presumably this adds to an existing collection rather than
    # replacing it, so earlier videos stay searchable - confirm this is intended.
    Qdrant.from_texts(
        texts=docs,
        embedding=embeddings,
        url=qdrant_url,
        prefer_grpc=False,
        collection_name="Youtube_Videos",
        api_key=api_key,
        timeout=50,
    )
    return "Qdrant database created"
# Function to answer questions based on the created Qdrant database
def get_answer(question):
    """Answer a question using the best-matching transcript chunk.

    The question is embedded with ``BAAI/bge-large-en``, the single closest
    chunk is fetched from the ``Youtube_Videos`` Qdrant collection, and the
    question plus that chunk are sent to the Groq chat model.

    Configuration is read from the environment:
        QDRANT_API_KEY: required; API key for the Qdrant cluster.
        GROQ_API_KEY: required; API key for the Groq chat model.
        QDRANT_URL: optional; overrides the default cluster URL.

    Args:
        question: The user's question as free text.

    Returns:
        The model's answer as a plain string, or a short explanatory
        message when the question is empty or nothing was retrieved.

    Raises:
        RuntimeError: If a required API key environment variable is missing.
    """
    if not question or not question.strip():
        return "Please enter a question."

    # Credentials must never live in source control; read them at call time.
    qdrant_api_key = os.environ.get("QDRANT_API_KEY")
    if not qdrant_api_key:
        raise RuntimeError("QDRANT_API_KEY environment variable is not set")
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError("GROQ_API_KEY environment variable is not set")
    qdrant_url = os.environ.get(
        "QDRANT_URL",
        "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333",
    )

    # Must match the embedding settings used when the collection was built.
    embeddings = HuggingFaceBgeEmbeddings(
        model_name='BAAI/bge-large-en',
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )
    client = QdrantClient(
        url=qdrant_url,
        prefer_grpc=False,
        api_key=qdrant_api_key,
        timeout=50,
    )
    db = Qdrant(
        client=client,
        embeddings=embeddings,
        collection_name="Youtube_Videos",
    )

    model = ChatGroq(
        api_key=groq_api_key,
        model="llama-3.1-70b-versatile",
        temperature=0,
    )

    # Retrieve only the single most similar chunk as context.
    hits = db.similarity_search_with_score(query=question, k=1)
    if not hits:
        return "No matching transcript content found. Create the database first."

    doc, _score = hits[0]
    reply = model.invoke(f"{question} : {doc.page_content}")
    # Return the message text, not the message object, so the Textbox shows
    # just the answer.
    return reply.content
# Gradio Interface
# Two side-by-side columns: the left one indexes a video's transcript, the
# right one asks questions against the indexed transcript.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(label="YouTube Video URL")
            output_text = gr.Textbox(label="Result")
            run_button = gr.Button("Create Qdrant Database")
            run_button.click(
                fn=create_qdrant_database,
                inputs=url_input,
                outputs=output_text,
            )
        with gr.Column():
            question_input = gr.Textbox(label="Ask a Question")
            answer_output = gr.Textbox(label="Answer")
            ask_button = gr.Button("Get Answer")
            ask_button.click(
                fn=get_answer,
                inputs=question_input,
                outputs=answer_output,
            )

# Only start the server when run as a script, so importing this module
# (e.g. for tests or tooling) does not block on a running web server.
if __name__ == "__main__":
    demo.launch()