# Author: Saim-11
# Commit: f95208e (verified) - "Update app.py"
import os
import re

import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_groq import ChatGroq
from pytubefix import YouTube
from qdrant_client import QdrantClient
# Matches an SRT timing line such as "00:00:01,000 --> 00:00:04,000".
_SRT_TIMESTAMP = re.compile(
    r"\d{2,}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2,}:\d{2}:\d{2}[,.]\d{3}"
)


def _srt_to_text(transcript):
    """Return only the caption text of an SRT string, joined by single spaces."""
    kept = []
    for raw_line in transcript.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if _SRT_TIMESTAMP.match(line):
            # The line right before a timing line is the cue number; it was
            # kept provisionally because it looks like ordinary text.
            if kept and kept[-1].isdigit():
                kept.pop()
            continue
        kept.append(line)
    return " ".join(kept)


# Function to extract the transcript text
def get_text(video_id):
    """Return the English caption text of a YouTube video as one string.

    Args:
        video_id: Value handed unchanged to ``pytubefix.YouTube``. The caller
            in this file passes the full video URL entered in the UI.

    Returns:
        The caption text with SRT cue numbers, timing lines and blank lines
        removed. Cues spanning several text lines are kept in full (the
        previous fixed "every 4th line" slicing assumed one text line per
        cue and drifted otherwise).

    Raises:
        ValueError: If no English caption track is available.
    """
    yt = YouTube(video_id)
    caption = yt.captions.get_by_language_code('en')
    if caption is None:
        # NOTE(review): 'a.en' is the code pytubefix documents for
        # auto-generated English captions - confirm for the pinned version.
        caption = yt.captions.get_by_language_code('a.en')
    if caption is None:
        raise ValueError("No English captions are available for this video.")
    return _srt_to_text(caption.generate_srt_captions())
# Function to create the Qdrant database
def create_qdrant_database(url):
    """Embed a video's English transcript and store it in Qdrant.

    The transcript is fetched with ``get_text``, split into overlapping
    chunks, embedded with ``BAAI/bge-large-en`` on CPU and written to the
    ``Youtube_Videos`` collection.

    Args:
        url: YouTube video URL, forwarded to ``get_text``.

    Returns:
        A status string for display in the UI.
    """
    text = get_text(url)

    # NOTE(review): 10000-character chunks are probably longer than the
    # embedding model's input window, so the tail of each chunk may not
    # influence its vector - confirm and consider a smaller chunk_size.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
    )
    docs = splitter.split_text(text)
    if not docs:
        # Nothing to embed; skip the upload instead of failing inside it.
        return "No transcript text found; nothing was added to Qdrant"

    embeddings = HuggingFaceBgeEmbeddings(
        model_name='BAAI/bge-large-en',
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )

    # NOTE(security): these credentials were committed in source and should
    # be treated as leaked - rotate them and supply QDRANT_URL /
    # QDRANT_API_KEY via the environment. The literals remain only as a
    # backward-compatible fallback.
    qdrant_url = os.environ.get(
        "QDRANT_URL",
        "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333",
    )
    api_key = os.environ.get(
        "QDRANT_API_KEY",
        "zIUUg_1QTtjSmCLNEpKnxJZeedKuh635c-YgGkDbI5EJ0ITjpOSyqw",
    )

    Qdrant.from_texts(
        texts=docs,
        embedding=embeddings,
        url=qdrant_url,
        prefer_grpc=False,
        collection_name="Youtube_Videos",
        api_key=api_key,
        timeout=50,
    )
    return "Qdrant database created"
# Function to answer questions based on the created Qdrant database
def get_answer(question):
    """Answer a question using the best-matching stored transcript chunk.

    The question is embedded with ``BAAI/bge-large-en``, the single closest
    chunk is fetched from the ``Youtube_Videos`` collection, and the question
    plus that chunk are sent to a Groq-hosted chat model.

    Args:
        question: The user's question text.

    Returns:
        The model's answer text, or an explanatory message when the search
        returns no chunk. (Previously the raw message object was returned,
        which the textbox rendered as its repr, and an empty search returned
        ``None``.)
    """
    # NOTE(security): these credentials were committed in source and should
    # be treated as leaked - rotate them and supply the values via the
    # environment. The literals remain only as a backward-compatible fallback.
    qdrant_url = os.environ.get(
        "QDRANT_URL",
        "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333",
    )
    qdrant_api_key = os.environ.get(
        "QDRANT_API_KEY",
        "zIUUg_1QTtjSmCLNEpKnxJZeedKuh635c-YgGkDbI5EJ0ITjpOSyqw",
    )
    groq_api_key = os.environ.get(
        "GROQ_API_KEY",
        "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",
    )
    # NOTE(review): confirm this model id is still served by Groq; override
    # with GROQ_MODEL if it has been retired.
    groq_model = os.environ.get("GROQ_MODEL", "llama-3.1-70b-versatile")

    # Must match the embedding setup used when the collection was populated.
    embeddings = HuggingFaceBgeEmbeddings(
        model_name='BAAI/bge-large-en',
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )
    client = QdrantClient(
        url=qdrant_url,
        prefer_grpc=False,
        api_key=qdrant_api_key,
        timeout=50,
    )
    db = Qdrant(
        client=client,
        embeddings=embeddings,
        collection_name="Youtube_Videos",
    )

    # Search for the relevant document and generate the answer
    matches = db.similarity_search_with_score(query=question, k=1)
    if not matches:
        return "No matching transcript content was found."
    doc, _score = matches[0]

    model = ChatGroq(api_key=groq_api_key, model=groq_model, temperature=0)
    response = model.invoke(f"{question} : {doc.page_content}")
    # Return only the text so the UI shows the answer, not the message repr.
    return response.content
# Gradio Interface
# Two side-by-side columns: the first indexes a video, the second queries it.
with gr.Blocks() as demo:
    with gr.Row():
        # Column 1: take a video URL and build the Qdrant collection from it.
        with gr.Column():
            url_input = gr.Textbox(label="YouTube Video URL")
            output_text = gr.Textbox(label="Result")
            run_button = gr.Button("Create Qdrant Database")
            run_button.click(
                fn=create_qdrant_database,
                inputs=url_input,
                outputs=output_text,
            )
        # Column 2: ask a question against the stored transcript.
        with gr.Column():
            question_input = gr.Textbox(label="Ask a Question")
            answer_output = gr.Textbox(label="Answer")
            ask_button = gr.Button("Get Answer")
            ask_button.click(
                fn=get_answer,
                inputs=question_input,
                outputs=answer_output,
            )

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse get_text) does not launch the web app as a side effect.
if __name__ == "__main__":
    demo.launch()