Spaces:

Saim-11
/

Youtube-video-chatbot

Sleeping

App Files Files Community

Youtube-video-chatbot / app.py

Saim-11

Update app.py

f95208e verified 10 months ago

raw

history blame contribute delete

3.78 kB

	import gradio as gr
	from langchain_community.vectorstores import Qdrant
	from langchain_community.embeddings import HuggingFaceBgeEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from pytubefix import YouTube
	from qdrant_client import QdrantClient
	from langchain_groq import ChatGroq
	import re

	# Function to extract the transcript text
	def _srt_to_text(srt):
	    """Return only the spoken text of an SRT document, joined by spaces."""
	    pieces = []
	    in_cue = False
	    for raw_line in srt.splitlines():
	        line = raw_line.strip()
	        if not line:
	            # A blank line terminates the current cue.
	            in_cue = False
	        elif "-->" in line:
	            # Timestamp line: everything up to the next blank line is text.
	            in_cue = True
	        elif in_cue:
	            pieces.append(line)
	        # Anything else is the numeric cue index and is skipped.
	    return " ".join(pieces)


	def get_text(video_id):
	    """Return the English caption text of a YouTube video as one string.

	    Args:
	        video_id: Value handed unchanged to ``YouTube(...)``; the UI passes
	            the content of its "YouTube Video URL" textbox.

	    Returns:
	        The caption text with SRT cue numbers and timestamps removed,
	        joined by single spaces.

	    Raises:
	        ValueError: If the video exposes no English caption track.
	    """
	    yt = YouTube(video_id)

	    caption = yt.captions.get_by_language_code('en')
	    if caption is None:
	        # Fall back to the auto-generated English track.
	        caption = yt.captions.get_by_language_code('a.en')
	    if caption is None:
	        raise ValueError("No English captions are available for this video.")

	    # SRT cues are "index / timestamp / text / blank"; parse them by
	    # structure rather than by a fixed stride so that multi-line cues do
	    # not shift the alignment.
	    return _srt_to_text(caption.generate_srt_captions())

	# Function to create the Qdrant database
	def create_qdrant_database(url):
	    """Embed a video's transcript and store it in the Qdrant collection.

	    The transcript returned by ``get_text(url)`` is split into chunks of
	    10000 characters with 1000 characters of overlap, embedded with the
	    ``BAAI/bge-large-en`` model on CPU, and written to the
	    ``Youtube_Videos`` collection.

	    Args:
	        url: Value forwarded to ``get_text``.

	    Returns:
	        ``"Qdrant database created"`` on success, or a short explanatory
	        message when the transcript produced no text to index.
	    """
	    # Function-scope import so this block does not depend on edits elsewhere.
	    import os

	    text = get_text(url)

	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=10000,
	        chunk_overlap=1000,
	    )
	    docs = splitter.split_text(text)
	    if not docs:
	        # Nothing to embed; avoid calling the vector store with no texts.
	        return "No transcript text found; Qdrant database not created"

	    embeddings = HuggingFaceBgeEmbeddings(
	        model_name='BAAI/bge-large-en',
	        model_kwargs={'device': 'cpu'},
	        encode_kwargs={'normalize_embeddings': False},
	    )

	    # SECURITY: these credentials were committed to the source file. Set
	    # QDRANT_URL / QDRANT_API_KEY in the environment instead, rotate the
	    # key, and then delete the literal fallbacks below.
	    qdrant_url = os.environ.get(
	        "QDRANT_URL",
	        "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333",
	    )
	    api_key = os.environ.get(
	        "QDRANT_API_KEY",
	        "zIUUg_1QTtjSmCLNEpKnxJZeedKuh635c-YgGkDbI5EJ0ITjpOSyqw",
	    )

	    # NOTE(review): the same collection name is used for every video, so
	    # presumably repeated runs add to existing data - confirm this is intended.
	    Qdrant.from_texts(
	        texts=docs,
	        embedding=embeddings,
	        url=qdrant_url,
	        prefer_grpc=False,
	        collection_name="Youtube_Videos",
	        api_key=api_key,
	        timeout=50,
	    )

	    return "Qdrant database created"

	# Function to answer questions based on the created Qdrant database
	def get_answer(question):
	    """Answer a question using the best-matching stored transcript chunk.

	    The question is embedded with ``BAAI/bge-large-en``, the single most
	    similar chunk (``k=1``) is fetched from the ``Youtube_Videos``
	    collection, and the question plus that chunk are sent to the Groq
	    chat model with temperature 0.

	    Args:
	        question: The user's question text.

	    Returns:
	        The text content of the model's reply, or a short message when the
	        collection returned no matching chunk.
	    """
	    # Function-scope import so this block does not depend on edits elsewhere.
	    import os

	    # SECURITY: these credentials were committed to the source file. Set
	    # QDRANT_URL / QDRANT_API_KEY / GROQ_API_KEY in the environment instead,
	    # rotate the keys, and then delete the literal fallbacks below.
	    qdrant_url = os.environ.get(
	        "QDRANT_URL",
	        "https://ec1c2790-c2e2-4c78-943f-5f9772492b2e.europe-west3-0.gcp.cloud.qdrant.io:6333",
	    )
	    qdrant_api_key = os.environ.get(
	        "QDRANT_API_KEY",
	        "zIUUg_1QTtjSmCLNEpKnxJZeedKuh635c-YgGkDbI5EJ0ITjpOSyqw",
	    )
	    groq_api_key = os.environ.get(
	        "GROQ_API_KEY",
	        "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",
	    )
	    # NOTE(review): check that this model id is still served by Groq;
	    # GROQ_MODEL allows switching without a code change.
	    chat_model_name = os.environ.get("GROQ_MODEL", "llama-3.1-70b-versatile")

	    # Must match the embedding configuration used when the data was indexed.
	    embeddings = HuggingFaceBgeEmbeddings(
	        model_name='BAAI/bge-large-en',
	        model_kwargs={'device': 'cpu'},
	        encode_kwargs={'normalize_embeddings': False},
	    )

	    client = QdrantClient(
	        url=qdrant_url,
	        prefer_grpc=False,
	        api_key=qdrant_api_key,
	        timeout=50,
	    )
	    db = Qdrant(
	        client=client,
	        embeddings=embeddings,
	        collection_name="Youtube_Videos",
	    )

	    matches = db.similarity_search_with_score(query=question, k=1)
	    if not matches:
	        # Previously this path fell off the end and returned None.
	        return "No relevant transcript content found. Create the database first."

	    doc, _score = matches[0]
	    model = ChatGroq(api_key=groq_api_key, model=chat_model_name, temperature=0)
	    response = model.invoke(f"{question} : {doc.page_content}")

	    # Return only the reply text so the UI shows the answer rather than
	    # the string form of the whole message object.
	    return response.content

	# Gradio Interface
	# Build a two-column page: indexing a video on the left, asking
	# questions about the indexed content on the right.
	with gr.Blocks() as demo, gr.Row():
	    # Left column: send the entered URL to create_qdrant_database and
	    # show the returned status text.
	    with gr.Column():
	        url_input = gr.Textbox(label="YouTube Video URL")
	        output_text = gr.Textbox(label="Result")
	        run_button = gr.Button("Create Qdrant Database")
	        run_button.click(create_qdrant_database, url_input, output_text)

	    # Right column: send the entered question to get_answer and show
	    # the returned answer.
	    with gr.Column():
	        question_input = gr.Textbox(label="Ask a Question")
	        answer_output = gr.Textbox(label="Answer")
	        ask_button = gr.Button("Get Answer")
	        ask_button.click(get_answer, question_input, answer_output)

	# Start the web server for the interface.
	demo.launch()